@misc{achab_one-step_2023,
title = {One-{Step} {Distributional} {Reinforcement} {Learning}},
url = {http://arxiv.org/abs/2304.14421},
doi = {10.48550/arXiv.2304.14421},
eprint = {2304.14421},
eprinttype = {arXiv},
abstract = {Reinforcement learning (RL) allows an agent interacting sequentially with an environment to maximize its long-term expected return. In the distributional RL (DistrRL) paradigm, the agent goes beyond the limit of the expected value, to capture the underlying probability distribution of the return across all time steps. The set of DistrRL algorithms has led to improved empirical performance. Nevertheless, the theory of DistrRL is still not fully understood, especially in the control case. In this paper, we present the simpler one-step distributional reinforcement learning (OS-DistrRL) framework encompassing only the randomness induced by the one-step dynamics of the environment. Contrary to DistrRL, we show that our approach comes with a unified theory for both policy evaluation and control. Indeed, we propose two OS-DistrRL algorithms for which we provide an almost sure convergence analysis. The proposed approach compares favorably with categorical DistrRL on various environments.},
language = {en},
urldate = {2023-10-13},
publisher = {arXiv},
author = {Achab, Mastane and Alami, Reda and Djilali, Yasser Abdelaziz Dahou and Fedyanin, Kirill and Moulines, Eric},
month = apr,
year = {2023},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
}

@misc{kumar_image_2023,
title = {Image data augmentation approaches: a comprehensive survey and future directions},
shorttitle = {Image data augmentation approaches},
url = {http://arxiv.org/abs/2301.02830},
doi = {10.48550/arXiv.2301.02830},
eprint = {2301.02830},
eprinttype = {arXiv},
abstract = {Deep learning (DL) algorithms have shown significant performance in various computer vision tasks. However, having limited labelled data lead to a network overfitting problem, where network performance is bad on unseen data as compared to training data. Consequently, it limits performance improvement. To cope with this problem, various techniques have been proposed such as dropout, normalization and advanced data augmentation. Among these, data augmentation, which aims to enlarge the dataset size by including sample diversity, has been a hot topic in recent times. In this article, we focus on advanced data augmentation techniques. we provide a background of data augmentation, a novel and comprehensive taxonomy of reviewed data augmentation techniques, and the strengths and weaknesses (wherever possible) of each technique. We also provide comprehensive results of the data augmentation effect on three popular computer vision tasks, such as image classification, object detection and semantic segmentation. For results reproducibility, we compiled available codes of all data augmentation techniques. Finally, we discuss the challenges and difficulties, and possible future direction for the research community. We believe, this survey provides several benefits i) readers will understand the data augmentation working mechanism to fix overfitting problems ii) results will save the searching time of the researcher for comparison purposes. iii) Codes of the mentioned data augmentation techniques are available at https://github.com/kmr2017/Advanced-Data-augmentation-codes iv) Future work will spark interest in research community.},
language = {en},
urldate = {2023-05-09},
publisher = {arXiv},
author = {Kumar, Teerath and Mileo, Alessandra and Brennan, Rob and Bendechache, Malika},
month = mar,
year = {2023},
keywords = {\#nosource, Computer Science - Artificial Intelligence, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
}

@misc{heExploringLargeLanguage2023,
title = {Exploring {{Large Language Models}} for {{Ontology Alignment}}},
author = {He, Yuan and Chen, Jiaoyan and Dong, Hang and Horrocks, Ian},
year = {2023},
month = sep,
number = {arXiv:2309.07172},
eprint = {2309.07172},
eprinttype = {arXiv},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2309.07172},
urldate = {2024-03-12},
abstract = {This work investigates the applicability of recent generative Large Language Models (LLMs), such as the GPT series and Flan-T5, to ontology alignment for identifying concept equivalence mappings across ontologies. To test the zero-shot performance of Flan-T5-XXL and GPT-3.5-turbo, we leverage challenging subsets from two equivalence matching datasets of the OAEI Bio-ML track, taking into account concept labels and structural contexts. Preliminary findings suggest that LLMs have the potential to outperform existing ontology alignment systems like BERTMap, given careful framework and prompt design.},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Machine Learning},
groups = {Ontologies and AI},
timestamp = {2024-03-12T15:15:00Z},
file = {heExploringLargeLanguage2023.pdf:/home/upal/Zotero/storage/SHGN7XCU/heExploringLargeLanguage2023.pdf:application/pdf;arXiv.org Snapshot:/home/upal/Zotero/storage/4K9JWLHU/2309.html:text/html},
}

@misc{chowdhury_almost_2023-1,
title = {{ALMOST}: {Adversarial} {Learning} to {Mitigate} {Oracle}-less {ML} {Attacks} via {Synthesis} {Tuning}},
shorttitle = {{ALMOST}},
url = {http://arxiv.org/abs/2303.03372},
doi = {10.48550/arXiv.2303.03372},
eprint = {2303.03372},
eprinttype = {arXiv},
abstract = {Oracle-less machine learning (ML) attacks have broken various logic locking schemes. Regular synthesis, which is tailored for area-power-delay optimization, yields netlists where key-gate localities are vulnerable to learning. Thus, we call for security-aware logic synthesis. We propose ALMOST, a framework for adversarial learning to mitigate oracle-less ML attacks via synthesis tuning. ALMOST uses a simulated-annealing-based synthesis recipe generator, employing adversarially trained models that can predict state-of-the-art attacks' accuracies over wide ranges of recipes and key-gate localities. Experiments on ISCAS benchmarks confirm the attacks' accuracies drops to around 50\% for ALMOST-synthesized circuits, all while not undermining design optimization.},
urldate = {2023-08-22},
publisher = {arXiv},
author = {Chowdhury, Animesh Basak and Alrahis, Lilas and Collini, Luca and Knechtel, Johann and Karri, Ramesh and Garg, Siddharth and Sinanoglu, Ozgur and Tan, Benjamin},
month = mar,
year = {2023},
keywords = {\#broken, Computer Science - Cryptography and Security, Computer Science - Machine Learning, Jab/\#Pre},
}

@misc{hurtado_continual_2023,
title = {Continual {Learning} for {Predictive} {Maintenance}: {Overview} and {Challenges}},
shorttitle = {Continual {Learning} for {Predictive} {Maintenance}},
url = {http://arxiv.org/abs/2301.12467},
doi = {10.48550/arXiv.2301.12467},
eprint = {2301.12467},
eprinttype = {arXiv},
abstract = {Machine learning techniques have become one of the main propellers for solving many engineering problems effectively and efficiently. In Predictive Maintenance, for instance, Data-Driven methods have been used to improve predictions of when maintenance is needed on different machines and operative contexts. However, one of the limitations of these methods is that they are trained on a fixed distribution that does not change over time, which seldom happens in real-world applications. When internal or external factors alter the data distribution, the model performance may decrease or even fail unpredictably, resulting in severe consequences for machine maintenance. Continual Learning methods propose ways of adapting prediction models and incorporating new knowledge after deployment. The main objective of these methods is to avoid the plasticity-stability dilemma by updating the parametric model while not forgetting previously learned tasks. In this work, we present the current state of the art in applying Continual Learning to Predictive Maintenance, with an extensive review of both disciplines. We first introduce the two research themes independently, then discuss the current intersection of Continual Learning and Predictive Maintenance. Finally, we discuss the main research directions and conclusions.},
urldate = {2023-06-03},
publisher = {arXiv},
author = {Hurtado, Julio and Salvati, Dario and Semola, Rudy and Bosio, Mattia and Lomonaco, Vincenzo},
month = jan,
year = {2023},
keywords = {Computer Science - Machine Learning},
}

@misc{labib_tailoring_2023,
title = {Tailoring {Adversarial} {Attacks} on {Deep} {Neural} {Networks} for {Targeted} {Class} {Manipulation} {Using} {DeepFool} {Algorithm}},
url = {http://arxiv.org/abs/2310.13019},
doi = {10.48550/arXiv.2310.13019},
eprint = {2310.13019},
eprinttype = {arXiv},
abstract = {Deep neural networks (DNNs) have significantly advanced various domains, but their vulnerability to adversarial attacks poses serious concerns. Understanding these vulnerabilities and developing effective defense mechanisms is crucial. DeepFool, an algorithm proposed by Moosavi-Dezfooli et al. (2016), finds minimal perturbations to misclassify input images. However, DeepFool lacks a targeted approach, making it less effective in specific attack scenarios. Also, in previous related works, researchers primarily focus on success, not considering how much an image is getting distorted; the integrity of the image quality, and the confidence level to misclassifying. So, in this paper, we propose Enhanced Targeted DeepFool, an augmented version of DeepFool that allows targeting specific classes for misclassification and also introduce a minimum confidence score requirement hyperparameter to enhance flexibility. Our experiments demonstrate the effectiveness and efficiency of the proposed method across different deep neural network architectures while preserving image integrity as much and perturbation rate as less as possible. By using our approach, the behavior of models can be manipulated arbitrarily using the perturbed images, as we can specify both the target class and the associated confidence score, unlike other DeepFool-derivative works, such as Targeted DeepFool by Gajjar et al. (2022). Results show that one of the deep convolutional neural network architectures, AlexNet, and one of the state-of-the-art model Vision Transformer exhibit high robustness to getting fooled. This approach can have larger implication, as our tuning of confidence level can expose the robustness of image recognition models. Our code will be made public upon acceptance of the paper.},
language = {en},
urldate = {2024-03-12},
publisher = {arXiv},
author = {Labib, S. M. Fazle Rabby and Mondal, Joyanta Jyoti and Manab, Meem Arafat},
month = nov,
year = {2023},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
}

@misc{caufieldStructuredPromptInterrogation2023,
title = {Structured Prompt Interrogation and Recursive Extraction of Semantics ({{SPIRES}}): {{A}} Method for Populating Knowledge Bases Using Zero-Shot Learning},
shorttitle = {Structured Prompt Interrogation and Recursive Extraction of Semantics ({{SPIRES}})},
author = {Caufield, J. Harry and Hegde, Harshad and Emonet, Vincent and Harris, Nomi L. and Joachimiak, Marcin P. and Matentzoglu, Nicolas and Kim, HyeongSik and Moxon, Sierra A. T. and Reese, Justin T. and Haendel, Melissa A. and Robinson, Peter N. and Mungall, Christopher J.},
year = {2023},
month = dec,
number = {arXiv:2304.02711},
eprint = {2304.02711},
eprinttype = {arXiv},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2304.02711},
urldate = {2024-03-13},
abstract = {Creating knowledge bases and ontologies is a time consuming task that relies on a manual curation. AI/NLP approaches can assist expert curators in populating these knowledge bases, but current approaches rely on extensive training data, and are not able to populate arbitrary complex nested knowledge schemas. Here we present Structured Prompt Interrogation and Recursive Extraction of Semantics (SPIRES), a Knowledge Extraction approach that relies on the ability of Large Language Models (LLMs) to perform zero-shot learning (ZSL) and general-purpose query answering from flexible prompts and return information conforming to a specified schema. Given a detailed, user-defined knowledge schema and an input text, SPIRES recursively performs prompt interrogation against GPT-3+ to obtain a set of responses matching the provided schema. SPIRES uses existing ontologies and vocabularies to provide identifiers for all matched elements. We present examples of use of SPIRES in different domains, including extraction of food recipes, multi-species cellular signaling pathways, disease treatments, multi-step drug mechanisms, and chemical to disease causation graphs. Current SPIRES accuracy is comparable to the mid-range of existing Relation Extraction (RE) methods, but has the advantage of easy customization, flexibility, and, crucially, the ability to perform new tasks in the absence of any training data. This method supports a general strategy of leveraging the language interpreting capabilities of LLMs to assemble knowledge bases, assisting manual knowledge curation and acquisition while supporting validation with publicly-available databases and ontologies external to the LLM. SPIRES is available as part of the open source OntoGPT package: https://github.com/monarch-initiative/ontogpt.},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning},
groups = {Ontologies and AI},
timestamp = {2024-03-13T12:53:10Z},
file = {caufieldStructuredPromptInterrogation2023.pdf:/home/upal/Zotero/storage/QEH3NQLR/caufieldStructuredPromptInterrogation2023.pdf:application/pdf;arXiv.org Snapshot:/home/upal/Zotero/storage/2JQ9BZ9M/2304.html:text/html},
}

@misc{panaganti_robust_2022,
title = {Robust {Reinforcement} {Learning} using {Offline} {Data}},
url = {http://arxiv.org/abs/2208.05129},
doi = {10.48550/arXiv.2208.05129},
eprint = {2208.05129},
eprinttype = {arXiv},
abstract = {The goal of robust reinforcement learning (RL) is to learn a policy that is robust against the uncertainty in model parameters. Parameter uncertainty commonly occurs in many real-world RL applications due to simulator modeling errors, changes in the real-world system dynamics over time, and adversarial disturbances. Robust RL is typically formulated as a max-min problem, where the objective is to learn the policy that maximizes the value against the worst possible models that lie in an uncertainty set. In this work, we propose a robust RL algorithm called Robust Fitted Q-Iteration (RFQI), which uses only an offline dataset to learn the optimal robust policy. Robust RL with offline data is significantly more challenging than its non-robust counterpart because of the minimization over all models present in the robust Bellman operator. This poses challenges in offline data collection, optimization over the models, and unbiased estimation. In this work, we propose a systematic approach to overcome these challenges, resulting in our RFQI algorithm. We prove that RFQI learns a near-optimal robust policy under standard assumptions and demonstrate its superior performance on standard benchmark problems.},
language = {en},
urldate = {2023-10-19},
publisher = {arXiv},
author = {Panaganti, Kishan and Xu, Zaiyan and Kalathil, Dileep and Ghavamzadeh, Mohammad},
month = oct,
year = {2022},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Statistics - Machine Learning},
}

@misc{liu_malice_2022,
title = {{MALICE}: {Manipulation} {Attacks} on {Learned} {Image} {ComprEssion}},
copyright = {All rights reserved},
shorttitle = {{MALICE}},
url = {http://arxiv.org/abs/2205.13253},
doi = {10.48550/arXiv.2205.13253},
eprint = {2205.13253},
eprinttype = {arXiv},
abstract = {Deep learning techniques have shown promising results in image compression, with competitive bitrate and image reconstruction quality from compressed latent. However, while image compression has progressed towards a higher peak signal-to-noise ratio (PSNR) and fewer bits per pixel (bpp), their robustness to adversarial images has never received deliberation. In this work, we, for the first time, investigate the robustness of image compression systems where imperceptible perturbation of input images can precipitate a significant increase in the bitrate of their compressed latent. To characterize the robustness of state-of-the-art learned image compression, we mount white-box and black-box attacks. Our white-box attack employs fast gradient sign method on the entropy estimation of the bitstream as its bitrate approximation. We propose DCT-Net simulating JPEG compression with architectural simplicity and lightweight training as the substitute in the black-box attack and enable fast adversarial transferability. Our results on six image compression models, each with six different bitrate qualities (thirty-six models in total), show that they are surprisingly fragile, where the white-box attack achieves up to 56.326x and black-box 1.947x bpp change. To improve robustness, we propose a novel compression architecture factorAtn which incorporates attention modules and a basic factorized entropy model, resulting in a promising trade-off between the rate-distortion performance and robustness to adversarial attacks that surpasses existing learned image compressors.},
urldate = {2022-08-24},
publisher = {arXiv},
author = {Liu, Kang and Wu, Di and Wang, Yiru and Feng, Dan and Tan, Benjamin and Garg, Siddharth},
month = aug,
year = {2022},
keywords = {\#broken, Computer Science - Artificial Intelligence, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Cryptography and Security, Computer Science - Machine Learning, Jab/\#Pre},
}

@misc{singh_scirepeval_2022,
title = {{SciRepEval}: {A} {Multi}-{Format} {Benchmark} for {Scientific} {Document} {Representations}},
shorttitle = {{SciRepEval}},
url = {http://arxiv.org/abs/2211.13308},
doi = {10.48550/arXiv.2211.13308},
eprint = {2211.13308},
eprinttype = {arXiv},
abstract = {Learned representations of scientific documents can serve as valuable input features for downstream tasks, without the need for further fine-tuning. However, existing benchmarks for evaluating these representations fail to capture the diversity of relevant tasks. In response, we introduce SciRepEval, the first comprehensive benchmark for training and evaluating scientific document representations. It includes 25 challenging and realistic tasks, 11 of which are new, across four formats: classification, regression, ranking and search. We then use the benchmark to study and improve the generalization ability of scientific document representation models. We show how state-of-the-art models struggle to generalize across task formats, and that simple multi-task training fails to improve them. However, a new approach that learns multiple embeddings per document, each tailored to a different format, can improve performance. We experiment with task-format-specific control codes and adapters in a multi-task setting and find that they outperform the existing single-embedding state-of-the-art by up to 1.5 points absolute.},
urldate = {2022-11-28},
publisher = {arXiv},
author = {Singh, Amanpreet and D'Arcy, Mike and Cohan, Arman and Downey, Doug and Feldman, Sergey},
month = nov,
year = {2022},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Information Retrieval, Computer Science - Machine Learning},
}

@misc{nijkamp_conversational_2022,
title = {A {Conversational} {Paradigm} for {Program} {Synthesis}},
url = {http://arxiv.org/abs/2203.13474},
doi = {10.48550/arXiv.2203.13474},
eprint = {2203.13474},
eprinttype = {arXiv},
abstract = {Program synthesis strives to generate a computer program as a solution to a given problem specification. We propose a conversational program synthesis approach via large language models, which addresses the challenges of searching over a vast program space and user intent specification faced in prior approaches. Our new approach casts the process of writing a specification and program as a multi-turn conversation between a user and a system. It treats program synthesis as a sequence prediction problem, in which the specification is expressed in natural language and the desired program is conditionally sampled. We train a family of large language models, called CodeGen, on natural language and programming language data. With weak supervision in the data and the scaling up of data size and model size, conversational capacities emerge from the simple autoregressive language modeling. To study the model behavior on conversational program synthesis, we develop a multi-turn programming benchmark (MTPB), where solving each problem requires multi-step synthesis via multi-turn conversation between the user and the model. Our findings show the emergence of conversational capabilities and the effectiveness of the proposed conversational program synthesis paradigm. In addition, our model CodeGen (with up to 16B parameters trained on TPU-v4) outperforms OpenAI's Codex on the HumanEval benchmark. We make the training library JaxFormer including checkpoints available as open source contribution: https://github.com/salesforce/CodeGen.},
urldate = {2022-08-18},
publisher = {arXiv},
author = {Nijkamp, Erik and Pang, Bo and Hayashi, Hiroaki and Tu, Lifu and Wang, Huan and Zhou, Yingbo and Savarese, Silvio and Xiong, Caiming},
month = mar,
year = {2022},
keywords = {\#nosource, Computer Science - Computation and Language, Computer Science - Machine Learning, Computer Science - Programming Languages},
}

@misc{poli_introducing_2022,
title = {Introducing topography in convolutional neural networks},
copyright = {All rights reserved},
url = {http://arxiv.org/abs/2211.13152},
doi = {10.48550/arXiv.2211.13152},
eprint = {2211.13152},
eprinttype = {arXiv},
abstract = {Parts of the brain that carry sensory tasks are organized topographically: nearby neurons are responsive to the same properties of input signals. Thus, in this work, inspired by the neuroscience literature, we proposed a new topographic inductive bias in Convolutional Neural Networks (CNNs). To achieve this, we introduced a new topographic loss and an efficient implementation to topographically organize each convolutional layer of any CNN. We benchmarked our new method on 4 datasets and 3 models in vision and audio tasks and showed equivalent performance to all benchmarks. Besides, we also showcased the generalizability of our topographic loss with how it can be used with different topographic organizations in CNNs. Finally, we demonstrated that adding the topographic inductive bias made CNNs more resistant to pruning. Our approach provides a new avenue to obtain models that are more memory efficient while maintaining better accuracy.},
urldate = {2023-03-04},
publisher = {arXiv},
author = {Poli, Maxime and Dupoux, Emmanuel and Riad, Rachid},
month = oct,
year = {2022},
keywords = {Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
}

@misc{van_kirk_hardware-efficient_2022,
title = {Hardware-efficient learning of quantum many-body states},
url = {http://arxiv.org/abs/2212.06084},
doi = {10.48550/arXiv.2212.06084},
eprint = {2212.06084},
eprinttype = {arXiv},
abstract = {Efficient characterization of highly entangled multi-particle systems is an outstanding challenge in quantum science. Recent developments have shown that a modest number of randomized measurements suffices to learn many properties of a quantum many-body system. However, implementing such measurements requires complete control over individual particles, which is unavailable in many experimental platforms. In this work, we present rigorous and efficient algorithms for learning quantum many-body states in systems with any degree of control over individual particles, including when every particle is subject to the same global field and no additional ancilla particles are available. We numerically demonstrate the effectiveness of our algorithms for estimating energy densities in a U(1) lattice gauge theory and classifying topological order using very limited measurement capabilities.},
language = {en},
urldate = {2023-06-27},
publisher = {arXiv},
author = {Van Kirk, Katherine and Cotler, Jordan and Huang, Hsin-Yuan and Lukin, Mikhail D.},
month = dec,
year = {2022},
keywords = {Quantum Physics, Computer Science - Machine Learning, Condensed Matter - Strongly Correlated Electrons},
annote = {Comment: 7+28 pages, 6 figures},
file = {Van Kirk et al. - 2022 - Hardware-efficient learning of quantum many-body s.pdf:/Users/georgehuang/Zotero/storage/DQIXQFHB/Van Kirk et al. - 2022 - Hardware-efficient learning of quantum many-body s.pdf:application/pdf},
}

@misc{kawase_parametric_2022,
title = {Parametric t-{Stochastic} {Neighbor} {Embedding} {With} {Quantum} {Neural} {Network}},
url = {http://arxiv.org/abs/2202.04238},
doi = {10.48550/arXiv.2202.04238},
eprint = {2202.04238},
eprinttype = {arXiv},
abstract = {t-Stochastic Neighbor Embedding (t-SNE) is a non-parametric data visualization method in classical machine learning. It maps the data from the high-dimensional space into a low-dimensional space, especially a two-dimensional plane, while maintaining the relationship, or similarities, between the surrounding points. In t-SNE, the initial position of the low-dimensional data is randomly determined, and the visualization is achieved by moving the low-dimensional data to minimize a cost function. Its variant called parametric t-SNE uses neural networks for this mapping. In this paper, we propose to use quantum neural networks for parametric t-SNE to reflect the characteristics of high-dimensional quantum data on low-dimensional data. We use fidelity-based metrics instead of Euclidean distance in calculating high-dimensional data similarity. We visualize both classical (Iris dataset) and quantum (time-depending Hamiltonian dynamics) data for classification tasks. Since this method allows us to represent a quantum dataset in a higher dimensional Hilbert space by a quantum dataset in a lower dimension while keeping their similarity, the proposed method can also be used to compress quantum data for further quantum machine learning.},
language = {en},
urldate = {2023-06-27},
publisher = {arXiv},
author = {Kawase, Yoshiaki and Mitarai, Kosuke and Fujii, Keisuke},
month = feb,
year = {2022},
keywords = {Quantum Physics, Computer Science - Machine Learning},
annote = {Comment: 9 pages, 7 figures},
file = {Kawase et al. - 2022 - Parametric t-Stochastic Neighbor Embedding With Qu.pdf:/Users/georgehuang/Zotero/storage/QZA7B2B2/Kawase et al. - 2022 - Parametric t-Stochastic Neighbor Embedding With Qu.pdf:application/pdf},
}

@article{kidger_neural_2022,
title = {On {Neural} {Differential} {Equations}},
url = {http://arxiv.org/abs/2202.02435},
doi = {10.48550/arXiv.2202.02435},
eprint = {2202.02435},
eprinttype = {arXiv},
abstract = {The conjoining of dynamical systems and deep learning has become a topic of great interest. In particular, neural differential equations (NDEs) demonstrate that neural networks and differential equation are two sides of the same coin. Traditional parameterised differential equations are a special case. Many popular neural network architectures, such as residual networks and recurrent networks, are discretisations. NDEs are suitable for tackling generative problems, dynamical systems, and time series (particularly in physics, finance, ...) and are thus of interest to both modern machine learning and traditional mathematical modelling. NDEs offer high-capacity function approximation, strong priors on model space, the ability to handle irregular data, memory efficiency, and a wealth of available theory on both sides. This doctoral thesis provides an in-depth survey of the field. Topics include: neural ordinary differential equations (e.g. for hybrid neural/mechanistic modelling of physical systems); neural controlled differential equations (e.g. for learning functions of irregular time series); and neural stochastic differential equations (e.g. to produce generative models capable of representing complex stochastic dynamics, or sampling from complex high-dimensional distributions). Further topics include: numerical methods for NDEs (e.g. reversible differential equations solvers, backpropagation through differential equations, Brownian reconstruction); symbolic regression for dynamical systems (e.g. via regularised evolution); and deep implicit models (e.g. deep equilibrium models, differentiable optimisation). We anticipate this thesis will be of interest to anyone interested in the marriage of deep learning with dynamical systems, and hope it will provide a useful reference for the current state of the art.},
language = {en},
urldate = {2022-02-10},
publisher = {arXiv},
author = {Kidger, Patrick},
month = feb,
year = {2022},
keywords = {Computer Science - Machine Learning, Mathematics - Classical Analysis and ODEs, Mathematics - Dynamical Systems, Mathematics - Numerical Analysis, Statistics - Machine Learning},
}

@misc{dery_should_2022,
title = {Should {We} {Be} {Pre}-training? {An} {Argument} for {End}-task {Aware} {Training} as an {Alternative}},
shorttitle = {Should {We} {Be} {Pre}-training?},
url = {http://arxiv.org/abs/2109.07437},
doi = {10.48550/arXiv.2109.07437},
eprint = {2109.07437},
eprinttype = {arXiv},
abstract = {In most settings of practical concern, machine learning practitioners know in advance what end-task they wish to boost with auxiliary tasks. However, widely used methods for leveraging auxiliary data like pre-training and its continued-pretraining variant are end-task agnostic: they rarely, if ever, exploit knowledge of the target task. We study replacing end-task agnostic continued training of pre-trained language models with end-task aware training of said models. We argue that for sufficiently important end-tasks, the benefits of leveraging auxiliary data in a task-aware fashion can justify forgoing the traditional approach of obtaining generic, end-task agnostic representations as with (continued) pre-training. On three different low-resource NLP tasks from two domains, we demonstrate that multi-tasking the end-task and auxiliary objectives results in significantly better downstream task performance than the widely-used task-agnostic continued pre-training paradigm of Gururangan et al. (2020). We next introduce an online meta-learning algorithm that learns a set of multi-task weights to better balance among our multiple auxiliary objectives, achieving further improvements on end-task performance and data efficiency.},
urldate = {2023-02-06},
publisher = {arXiv},
author = {Dery, Lucio M. and Michel, Paul and Talwalkar, Ameet and Neubig, Graham},
month = feb,
year = {2022},
keywords = {Computer Science - Computation and Language, Computer Science - Machine Learning},
}

@misc{chowdhury_too_2022,
title = {Too {Big} to {Fail}? {Active} {Few}-{Shot} {Learning} {Guided} {Logic} {Synthesis}},
shorttitle = {Too {Big} to {Fail}?},
url = {http://arxiv.org/abs/2204.02368},
abstract = {Generating sub-optimal synthesis transformation sequences ("synthesis recipe") is an important problem in logic synthesis. Manually crafted synthesis recipes have poor quality. State-of-the art machine learning (ML) works to generate synthesis recipes do not scale to large netlists as the models need to be trained from scratch, for which training data is collected using time consuming synthesis runs. We propose a new approach, Bulls-Eye, that fine-tunes a pre-trained model on past synthesis data to accurately predict the quality of a synthesis recipe for an unseen netlist. This approach on achieves 2x-10x run-time improvement and better quality-of-result (QoR) than state-of-the-art machine learning approaches.},
urldate = {2022-06-04},
publisher = {arXiv},
author = {Chowdhury, Animesh Basak and Tan, Benjamin and Carey, Ryan and Jain, Tushit and Karri, Ramesh and Garg, Siddharth},
month = apr,
year = {2022},
note = {arXiv:2204.02368 [cs]},
keywords = {\#broken, Computer Science - Artificial Intelligence, Computer Science - Hardware Architecture, Computer Science - Machine Learning, Jab/\#Pre},
}@inproceedings{argyle_out_2022,
title = {Out of {One}, {Many}: {Using} {Language} {Models} to {Simulate} {Human} {Samples}},
shorttitle = {Out of {One}, {Many}},
url = {http://arxiv.org/abs/2209.06899},
doi = {10.18653/v1/2022.acl-long.60},
abstract = {We propose and explore the possibility that language models can be studied as effective proxies for specific human sub-populations in social science research. Practical and research applications of artificial intelligence tools have sometimes been limited by problematic biases (such as racism or sexism), which are often treated as uniform properties of the models. We show that the "algorithmic bias" within one such tool -- the GPT-3 language model -- is instead both fine-grained and demographically correlated, meaning that proper conditioning will cause it to accurately emulate response distributions from a wide variety of human subgroups. We term this property "algorithmic fidelity" and explore its extent in GPT-3. We create "silicon samples" by conditioning the model on thousands of socio-demographic backstories from real human participants in multiple large surveys conducted in the United States. We then compare the silicon and human samples to demonstrate that the information contained in GPT-3 goes far beyond surface similarity. It is nuanced, multifaceted, and reflects the complex interplay between ideas, attitudes, and socio-cultural context that characterize human attitudes. We suggest that language models with sufficient algorithmic fidelity thus constitute a novel and powerful tool to advance understanding of humans and society across a variety of disciplines.},
urldate = {2023-08-10},
author = {Argyle, Lisa P. and Busby, Ethan C. and Fulda, Nancy and Gubler, Joshua and Rytting, Christopher and Wingate, David},
year = {2022},
note = {arXiv:2209.06899 [cs]},
keywords = {Computer Science - Computation and Language, Computer Science - Machine Learning},
pages = {819--862},
}@article{villaescusa-navarro_camels_2022,
title = {The {CAMELS} project: public data release},
shorttitle = {The {CAMELS} project},
url = {http://arxiv.org/abs/2201.01300},
abstract = {The Cosmology and Astrophysics with MachinE Learning Simulations (CAMELS) project was developed to combine cosmology with astrophysics through thousands of cosmological hydrodynamic simulations and machine learning. CAMELS contains 4,233 cosmological simulations, 2,049 N-body and 2,184 state-of-the-art hydrodynamic simulations that sample a vast volume in parameter space. In this paper we present the CAMELS public data release, describing the characteristics of the CAMELS simulations and a variety of data products generated from them, including halo, subhalo, galaxy, and void catalogues, power spectra, bispectra, Lyman-\${\textbackslash}alpha\$ spectra, probability distribution functions, halo radial profiles, and X-rays photon lists. We also release over one thousand catalogues that contain billions of galaxies from CAMELS-SAM: a large collection of N-body simulations that have been combined with the Santa Cruz Semi-Analytic Model. We release all the data, comprising more than 350 terabytes and containing 143,922 snapshots, millions of halos, galaxies and summary statistics. We provide further technical details on how to access, download, read, and process the data at {\textbackslash}url\{https://camels.readthedocs.io\}.},
urldate = {2022-01-13},
journal = {arXiv:2201.01300 [astro-ph]},
author = {Villaescusa-Navarro, Francisco and Genel, Shy and Anglés-Alcázar, Daniel and Perez, Lucia A. and Villanueva-Domingo, Pablo and Wadekar, Digvijay and Shao, Helen and Mohammad, Faizan G. and Hassan, Sultan and Moser, Emily and Lau, Erwin T. and Valle, Luis Fernando Machado Poletti and Nicola, Andrina and Thiele, Leander and Jo, Yongseok and Philcox, Oliver H. E. and Oppenheimer, Benjamin D. and Tillman, Megan and Hahn, ChangHoon and Kaushal, Neerav and Pisani, Alice and Gebhardt, Matthew and Delgado, Ana Maria and Caliendo, Joyce and Kreisch, Christina and Wong, Kaze W. K. and Coulton, William R. and Eickenberg, Michael and Parimbelli, Gabriele and Ni, Yueying and Steinwandel, Ulrich P. and La Torre, Valentina and Dave, Romeel and Battaglia, Nicholas and Nagai, Daisuke and Spergel, David N. and Hernquist, Lars and Burkhart, Blakesley and Narayanan, Desika and Wandelt, Benjamin and Somerville, Rachel S. and Bryan, Greg L. and Viel, Matteo and Li, Yin and Irsic, Vid and Kraljic, Katarina and Vogelsberger, Mark},
month = jan,
year = {2022},
note = {arXiv: 2201.01300},
keywords = {Astrophysics - Astrophysics of Galaxies, Astrophysics - Cosmology and Nongalactic Astrophysics, Astrophysics - Instrumentation and Methods for Astrophysics, Computer Science - Artificial Intelligence, Computer Science - Machine Learning},
}@misc{li_systematic_2021,
title = {A {Systematic} {Collection} of {Medical} {Image} {Datasets} for {Deep} {Learning}},
url = {http://arxiv.org/abs/2106.12864},
abstract = {The astounding success made by artificial intelligence (AI) in healthcare and other fields proves that AI can achieve human-like performance. However, success always comes with challenges. Deep learning algorithms are data-dependent and require large datasets for training. The lack of data in the medical imaging field creates a bottleneck for the application of deep learning to medical image analysis. Medical image acquisition, annotation, and analysis are costly, and their usage is constrained by ethical restrictions. They also require many resources, such as human expertise and funding. That makes it difficult for non-medical researchers to have access to useful and large medical data. Thus, as comprehensive as possible, this paper provides a collection of medical image datasets with their associated challenges for deep learning research. We have collected information of around three hundred datasets and challenges mainly reported between 2013 and 2020 and categorized them into four categories: head \& neck, chest \& abdomen, pathology \& blood, and ``others''. Our paper has three purposes: 1) to provide a most up to date and complete list that can be used as a universal reference to easily find the datasets for clinical image analysis, 2) to guide researchers on the methodology to test and evaluate their methods' performance and robustness on relevant datasets, 3) to provide a ``route'' to relevant algorithms for the relevant medical topics, and challenge leaderboards.},
language = {en},
urldate = {2023-09-05},
publisher = {arXiv},
author = {Li, Johann and Zhu, Guangming and Hua, Cong and Feng, Mingtao and Bennamoun, Basheer and Li, Ping and Lu, Xiaoyuan and Song, Juan and Shen, Peiyi and Xu, Xu and Mei, Lin and Zhang, Liang and Shah, Syed Afaq Ali and Bennamoun, Mohammed},
month = jun,
year = {2021},
note = {arXiv:2106.12864 [cs, eess]},
keywords = {/unread, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Electrical Engineering and Systems Science - Image and Video Processing},
}@misc{craswell_overview_2021,
title = {Overview of the {TREC} 2020 deep learning track},
url = {http://arxiv.org/abs/2102.07662},
doi = {10.48550/arXiv.2102.07662},
abstract = {This is the second year of the TREC Deep Learning Track, with the goal of studying ad hoc ranking in the large training data regime. We again have a document retrieval task and a passage retrieval task, each with hundreds of thousands of human-labeled training queries. We evaluate using single-shot TREC-style evaluation, to give us a picture of which ranking methods work best when large data is available, with much more comprehensive relevance labeling on the small number of test queries. This year we have further evidence that rankers with BERT-style pretraining outperform other rankers in the large data regime.},
urldate = {2024-01-05},
author = {Craswell, Nick and Mitra, Bhaskar and Yilmaz, Emine and Campos, Daniel},
month = feb,
year = {2021},
note = {276 citations (Semantic Scholar/arXiv) [2024-01-06]
arXiv:2102.07662 [cs]},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Information Retrieval, Computer Science - Machine Learning},
}@article{tamkin_dabs_2021,
title = {{DABS}: {A} {Domain}-{Agnostic} {Benchmark} for {Self}-{Supervised} {Learning}},
shorttitle = {{DABS}},
url = {http://arxiv.org/abs/2111.12062},
abstract = {Self-supervised learning algorithms, including BERT and SimCLR, have enabled significant strides in fields like natural language processing, computer vision, and speech processing. However, these algorithms are domain-specific, meaning that new self-supervised learning algorithms must be developed for each new setting, including myriad healthcare, scientific, and multimodal domains. To catalyze progress toward domain-agnostic methods, we introduce DABS: a Domain-Agnostic Benchmark for Self-supervised learning. To perform well on DABS, an algorithm is evaluated on seven diverse domains: natural images, multichannel sensor data, English text, speech recordings, multilingual text, chest x-rays, and images with text descriptions. Each domain contains an unlabeled dataset for pretraining; the model is then scored based on its downstream performance on a set of labeled tasks in the domain. We also present e-Mix and ShED: two baseline domain-agnostic algorithms; their relatively modest performance demonstrates that significant progress is needed before self-supervised learning is an out-of-the-box solution for arbitrary domains. Code for benchmark datasets and baseline algorithms is available at https://github.com/alextamkin/dabs.},
language = {en},
urldate = {2021-12-08},
journal = {arXiv:2111.12062 [cs]},
author = {Tamkin, Alex and Liu, Vincent and Lu, Rongfei and Fein, Daniel and Schultz, Colin and Goodman, Noah},
month = nov,
year = {2021},
note = {arXiv: 2111.12062},
keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
}@misc{beucler_climate-invariant_2021,
title = {Climate-{Invariant} {Machine} {Learning}},
url = {http://arxiv.org/abs/2112.08440},
abstract = {Data-driven algorithms, in particular neural networks, can emulate the effects of unresolved processes in coarse-resolution climate models when trained on high-resolution simulation data; however, they often make large generalization errors when evaluated in conditions they were not trained on. Here, we propose to physically rescale the inputs and outputs of machine learning algorithms to help them generalize to unseen climates. Applied to offline parameterizations of subgrid-scale thermodynamics in three distinct climate models, we show that rescaled or "climate-invariant" neural networks make accurate predictions in test climates that are 4K and 8K warmer than their training climates. Additionally, "climate-invariant" neural nets facilitate generalization between Aquaplanet and Earth-like simulations. Through visualization and attribution methods, we show that compared to standard machine learning models, "climate-invariant" algorithms learn more local and robust relations between storm-scale convection, radiation, and their synoptic thermodynamic environment. Overall, these results suggest that explicitly incorporating physical knowledge into data-driven models of Earth system processes can improve their consistency and ability to generalize across climate regimes.},
urldate = {2023-04-18},
publisher = {arXiv},
author = {Beucler, Tom and Pritchard, Michael and Yuval, Janni and Gupta, Ankitesh and Peng, Liran and Rasp, Stephan and Ahmed, Fiaz and O'Gorman, Paul A. and Neelin, J. David and Lutsko, Nicholas J. and Gentine, Pierre},
month = dec,
year = {2021},
note = {arXiv:2112.08440 [physics]},
keywords = {Computer Science - Machine Learning, Physics - Atmospheric and Oceanic Physics, Physics - Computational Physics},
}@article{ghojogh_kkt_2021,
title = {{KKT} {Conditions}, {First}-{Order} and {Second}-{Order} {Optimization}, and {Distributed} {Optimization}: {Tutorial} and {Survey}},
shorttitle = {{KKT} {Conditions}, {First}-{Order} and {Second}-{Order} {Optimization}, and {Distributed} {Optimization}},
url = {http://arxiv.org/abs/2110.01858},
abstract = {This is a tutorial and survey paper on Karush-Kuhn-Tucker (KKT) conditions, first-order and second-order numerical optimization, and distributed optimization. After a brief review of history of optimization, we start with some preliminaries on properties of sets, norms, functions, and concepts of optimization. Then, we introduce the optimization problem, standard optimization problems (including linear programming, quadratic programming, and semidefinite programming), and convex problems. We also introduce some techniques such as eliminating inequality, equality, and set constraints, adding slack variables, and epigraph form. We introduce Lagrangian function, dual variables, KKT conditions (including primal feasibility, dual feasibility, weak and strong duality, complementary slackness, and stationarity condition), and solving optimization by method of Lagrange multipliers. Then, we cover first-order optimization including gradient descent, line-search, convergence of gradient methods, momentum, steepest descent, and backpropagation. Other first-order methods are explained, such as accelerated gradient method, stochastic gradient descent, mini-batch gradient descent, stochastic average gradient, stochastic variance reduced gradient, AdaGrad, RMSProp, and Adam optimizer, proximal methods (including proximal mapping, proximal point algorithm, and proximal gradient method), and constrained gradient methods (including projected gradient method, projection onto convex sets, and Frank-Wolfe method). We also cover non-smooth and \${\textbackslash}ell\_1\$ optimization methods including lasso regularization, convex conjugate, Huber function, soft-thresholding, coordinate descent, and subgradient methods. Then, we explain second-order methods including Newton's method for unconstrained, equality constrained, and inequality constrained problems....},
urldate = {2022-02-14},
journal = {arXiv:2110.01858 [cs, math]},
author = {Ghojogh, Benyamin and Ghodsi, Ali and Karray, Fakhri and Crowley, Mark},
month = oct,
year = {2021},
note = {5 citations (Semantic Scholar/arXiv) [2022-07-13]
arXiv: 2110.01858},
keywords = {Computer Science - Distributed, Parallel, and Cluster Computing, Computer Science - Machine Learning, Mathematics - Numerical Analysis, Mathematics - Optimization and Control, ⛔ No DOI found},
}@inproceedings{lyu_styleptb_2021,
title = {{StylePTB}: {A} {Compositional} {Benchmark} for {Fine}-grained {Controllable} {Text} {Style} {Transfer}},
shorttitle = {{StylePTB}},
url = {http://arxiv.org/abs/2104.05196},
abstract = {Text style transfer aims to controllably generate text with targeted stylistic changes while maintaining core meaning from the source sentence constant. Many of the existing style transfer benchmarks primarily focus on individual high-level semantic changes (e.g. positive to negative), which enable controllability at a high level but do not offer fine-grained control involving sentence structure, emphasis, and content of the sentence. In this paper, we introduce a large-scale benchmark, StylePTB, with (1) paired sentences undergoing 21 fine-grained stylistic changes spanning atomic lexical, syntactic, semantic, and thematic transfers of text, as well as (2) compositions of multiple transfers which allow modeling of fine-grained stylistic changes as building blocks for more complex, high-level transfers. By benchmarking existing methods on StylePTB, we find that they struggle to model fine-grained changes and have an even more difficult time composing multiple styles. As a result, StylePTB brings novel challenges that we hope will encourage future research in controllable text style transfer, compositional models, and learning disentangled representations. Solving these challenges would present important steps towards controllable text generation.},
urldate = {2021-05-07},
booktitle = {{NAACL}},
publisher = {Association for Computational Linguistics},
author = {Lyu, Yiwei and Liang, Paul Pu and Pham, Hai and Hovy, Eduard and Póczos, Barnabás and Salakhutdinov, Ruslan and Morency, Louis-Philippe},
month = apr,
year = {2021},
note = {arXiv: 2104.05196},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Machine Learning},
}@article{blondel_fast_2020,
title = {Fast {Differentiable} {Sorting} and {Ranking}},
url = {http://arxiv.org/abs/2002.08871},
abstract = {The sorting operation is one of the most commonly used building blocks in computer programming. In machine learning, it is often used for robust statistics. However, seen as a function, it is piecewise linear and as a result includes many kinks where it is non-differentiable. More problematic is the related ranking operator, often used for order statistics and ranking metrics. It is a piecewise constant function, meaning that its derivatives are null or undefined. While numerous works have proposed differentiable proxies to sorting and ranking, they do not achieve the \$O(n {\textbackslash}log n)\$ time complexity one would expect from sorting and ranking operations. In this paper, we propose the first differentiable sorting and ranking operators with \$O(n {\textbackslash}log n)\$ time and \$O(n)\$ space complexity. Our proposal in addition enjoys exact computation and differentiation. We achieve this feat by constructing differentiable operators as projections onto the permutahedron, the convex hull of permutations, and using a reduction to isotonic optimization. Empirically, we confirm that our approach is an order of magnitude faster than existing approaches and showcase two novel applications: differentiable Spearman's rank correlation coefficient and least trimmed squares.},
urldate = {2022-02-14},
journal = {arXiv:2002.08871 [cs, stat]},
author = {Blondel, Mathieu and Teboul, Olivier and Berthet, Quentin and Djolonga, Josip},
month = jun,
year = {2020},
note = {arXiv: 2002.08871},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
}@article{ullmo_encoding_2020,
title = {Encoding large scale cosmological structure with {Generative} {Adversarial} {Networks}},
volume = {2011},
url = {http://adsabs.harvard.edu/abs/2020arXiv201105244U},
abstract = {Recently a type of neural networks called Generative Adversarial
Networks (GANs) has been proposed as a solution for fast generation of
simulation-like datasets, in an attempt to bypass heavy computations and
expensive cosmological simulations to run in terms of time and computing
power. In the present work, we build and train a GAN to look further
into the strengths and limitations of such an approach. We then propose
a novel method in which we make use of a trained GAN to construct a
simple autoencoder (AE) as a first step towards building a predictive
model. Both the GAN and AE are trained on images issued from two types
of N-body simulations, namely 2D and 3D simulations. We find that the
GAN successfully generates new images that are statistically consistent
with the images it was trained on. We then show that the AE manages to
efficiently extract information from simulation images, satisfyingly
inferring the latent encoding of the GAN to generate an image with
similar large scale structures.},
urldate = {2020-11-13},
journal = {arXiv e-prints},
author = {Ullmo, Marion and Decelle, Aurélien and Aghanim, Nabila},
month = nov,
year = {2020},
keywords = {Astrophysics - Cosmology and Nongalactic Astrophysics, Computer Science - Machine Learning},
pages = {arXiv:2011.05244},
}@article{friederich_scientific_2020,
title = {Scientific intuition inspired by machine learning generated hypotheses},
url = {http://arxiv.org/abs/2010.14236},
abstract = {Machine learning with application to questions in the physical sciences has become a widely used tool, successfully applied to classification, regression and optimization tasks in many areas. Research focus mostly lies in improving the accuracy of the machine learning models in numerical predictions, while scientific understanding is still almost exclusively generated by human researchers analysing numerical results and drawing conclusions. In this work, we shift the focus on the insights and the knowledge obtained by the machine learning models themselves. In particular, we study how it can be extracted and used to inspire human scientists to increase their intuitions and understanding of natural systems. We apply gradient boosting in decision trees to extract human interpretable insights from big data sets from chemistry and physics. In chemistry, we not only rediscover widely know rules of thumb but also find new interesting motifs that tell us how to control solubility and energy levels of organic molecules. At the same time, in quantum physics, we gain new understanding on experiments for quantum entanglement. The ability to go beyond numerics and to enter the realm of scientific insight and hypothesis generation opens the door to use machine learning to accelerate the discovery of conceptual understanding in some of the most challenging domains of science.},
urldate = {2020-11-13},
journal = {arXiv},
author = {Friederich, Pascal and Krenn, Mario and Tamblyn, Isaac and Aspuru-Guzik, Alan},
month = oct,
year = {2020},
note = {arXiv: 2010.14236},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computational Engineering, Finance, and Science, Computer Science - Machine Learning, Physics - Chemical Physics, Quantum Physics},
}@article{e_machine_2020,
title = {Machine learning and computational mathematics},
volume = {28},
issn = {1815-2406, 1991-7120},
url = {http://arxiv.org/abs/2009.14596},
doi = {10.4208/cicp.OA-2020-0185},
abstract = {Neural network-based machine learning is capable of approximating functions in very high dimension with unprecedented efficiency and accuracy. This has opened up many exciting new possibilities, not just in traditional areas of artificial intelligence, but also in scientific computing and computational science. At the same time, machine learning has also acquired the reputation of being a set of "black box" type of tricks, without fundamental principles. This has been a real obstacle for making further progress in machine learning. In this article, we try to address the following two very important questions: (1) How machine learning has already impacted and will further impact computational mathematics, scientific computing and computational science? (2) How computational mathematics, particularly numerical analysis, \{can\} impact machine learning? We describe some of the most important progress that has been made on these issues. Our hope is to put things into a perspective that will help to integrate machine learning with computational mathematics.},
language = {en},
number = {5},
urldate = {2024-01-10},
journal = {Communications in Computational Physics},
author = {E, Weinan},
month = jun,
year = {2020},
note = {arXiv:2009.14596 [cs, math, stat]},
keywords = {68T07, 46E15, 26B35, 26B40, Computer Science - Machine Learning, Mathematics - Numerical Analysis, Statistics - Machine Learning},
pages = {1639--1670},
}@misc{raffel_exploring_2020,
title = {Exploring the {Limits} of {Transfer} {Learning} with a {Unified} {Text}-to-{Text} {Transformer}},
url = {http://arxiv.org/abs/1910.10683},
doi = {10.48550/arXiv.1910.10683},
abstract = {Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts all text-based language problems into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new ``Colossal Clean Crawled Corpus'', we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our data set, pre-trained models, and code.},
urldate = {2023-02-13},
publisher = {arXiv},
author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
month = jul,
year = {2020},
note = {arXiv:1910.10683 [cs, stat]},
keywords = {Computer Science - Computation and Language, Computer Science - Machine Learning, Statistics - Machine Learning},
}@article{engstrom_implementation_2020,
title = {Implementation {Matters} in {Deep} {Policy} {Gradients}: {A} {Case} {Study} on {PPO} and {TRPO}},
shorttitle = {Implementation {Matters} in {Deep} {Policy} {Gradients}},
url = {http://arxiv.org/abs/2005.12729},
abstract = {We study the roots of algorithmic progress in deep policy gradient algorithms through a case study on two popular algorithms: Proximal Policy Optimization (PPO) and Trust Region Policy Optimization (TRPO). Specifically, we investigate the consequences of "code-level optimizations:" algorithm augmentations found only in implementations or described as auxiliary details to the core algorithm. Seemingly of secondary importance, such optimizations turn out to have a major impact on agent behavior. Our results show that they (a) are responsible for most of PPO's gain in cumulative reward over TRPO, and (b) fundamentally change how RL methods function. These insights show the difficulty and importance of attributing performance gains in deep reinforcement learning. Code for reproducing our results is available at https://github.com/MadryLab/implementation-matters .},
urldate = {2022-03-30},
journal = {arXiv:2005.12729 [cs, stat]},
author = {Engstrom, Logan and Ilyas, Andrew and Santurkar, Shibani and Tsipras, Dimitris and Janoos, Firdaus and Rudolph, Larry and Madry, Aleksander},
month = may,
year = {2020},
note = {arXiv: 2005.12729},
keywords = {Computer Science - Machine Learning, Computer Science - Robotics, Statistics - Machine Learning},
}@article{bousquet_theory_2020,
title = {A {Theory} of {Universal} {Learning}},
url = {http://arxiv.org/abs/2011.04483},
abstract = {How quickly can a given class of concepts be learned from examples? It is common to measure the performance of a supervised machine learning algorithm by plotting its "learning curve", that is, the decay of the error rate as a function of the number of training examples. However, the classical theoretical framework for understanding learnability, the PAC model of Vapnik-Chervonenkis and Valiant, does not explain the behavior of learning curves: the distribution-free PAC model of learning can only bound the upper envelope of the learning curves over all possible data distributions. This does not match the practice of machine learning, where the data source is typically fixed in any given scenario, while the learner may choose the number of training examples on the basis of factors such as computational resources and desired accuracy. In this paper, we study an alternative learning model that better captures such practical aspects of machine learning, but still gives rise to a complete theory of the learnable in the spirit of the PAC model. More precisely, we consider the problem of universal learning, which aims to understand the performance of learning algorithms on every data distribution, but without requiring uniformity over the distribution. The main result of this paper is a remarkable trichotomy: there are only three possible rates of universal learning. More precisely, we show that the learning curves of any given concept class decay either at an exponential, linear, or arbitrarily slow rates. Moreover, each of these cases is completely characterized by appropriate combinatorial parameters, and we exhibit optimal learning algorithms that achieve the best possible rate in each case. For concreteness, we consider in this paper only the realizable case, though analogous results are expected to extend to more general learning scenarios.},
urldate = {2022-02-14},
journal = {arXiv:2011.04483 [cs, math, stat]},
author = {Bousquet, Olivier and Hanneke, Steve and Moran, Shay and van Handel, Ramon and Yehudayoff, Amir},
month = nov,
year = {2020},
note = {arXiv: 2011.04483},
keywords = {Computer Science - Data Structures and Algorithms, Computer Science - Machine Learning, Mathematics - Statistics Theory, Statistics - Machine Learning},
}@article{antil_bilevel_2020,
title = {Bilevel {Optimization}, {Deep} {Learning} and {Fractional} {Laplacian} {Regularization} with {Applications} in {Tomography}},
volume = {36},
issn = {0266-5611, 1361-6420},
url = {http://arxiv.org/abs/1907.09605},
doi = {10.1088/1361-6420/ab80d7},
abstract = {In this work we consider a generalized bilevel optimization framework for solving inverse problems. We introduce fractional Laplacian as a regularizer to improve the reconstruction quality, and compare it with the total variation regularization. We emphasize that the key advantage of using fractional Laplacian as a regularizer is that it leads to a linear operator, as opposed to the total variation regularization which results in a nonlinear degenerate operator. Inspired by residual neural networks, to learn the optimal strength of regularization and the exponent of fractional Laplacian, we develop a dedicated bilevel optimization neural network with a variable depth for a general regularized inverse problem. We also draw some parallels between an activation function in a neural network and regularization. We illustrate how to incorporate various regularizer choices into our proposed network. As an example, we consider tomographic reconstruction as a model problem and show an improvement in reconstruction quality, especially for limited data, via fractional Laplacian regularization. We successfully learn the regularization strength and the fractional exponent via our proposed bilevel optimization neural network. We observe that the fractional Laplacian regularization outperforms total variation regularization. This is specially encouraging, and important, in the case of limited and noisy data.},
number = {6},
urldate = {2022-02-14},
journal = {Inverse Problems},
author = {Antil, Harbir and Di, Zichao and Khatri, Ratna},
month = jun,
year = {2020},
note = {arXiv: 1907.09605},
keywords = {/unread, 65D18, 68U10, 62H35, 94A08, 35R11, 34K37, 65K10, Computer Science - Machine Learning, Electrical Engineering and Systems Science - Image and Video Processing, Mathematics - Numerical Analysis, Mathematics - Optimization and Control},
pages = {064001},
}@article{huang_deeppurpose_2020,
title = {{DeepPurpose}: a {Deep} {Learning} {Based} {Drug} {Repurposing} {Toolkit}},
shorttitle = {{DeepPurpose}},
url = {http://arxiv.org/abs/2004.08919},
abstract = {We present DeepPurpose, a deep learning toolkit for simple and efficient drug repurposing. With a few lines of code, DeepPurpose generates drug candidates based on aggregating five pretrained state-of-the-art models while offering flexibility for users to train their own models with 15 drug/target encodings and \$50+\$ novel architectures. We demonstrated DeepPurpose using case studies, including repurposing for COVID-19 where promising candidates under trials are ranked high in our results.},
urldate = {2020-06-30},
journal = {arXiv:2004.08919 [cs, q-bio, stat]},
author = {Huang, Kexin and Fu, Tianfan and Xiao, Cao and Glass, Lucas and Sun, Jimeng},
month = apr,
year = {2020},
note = {arXiv: 2004.08919},
keywords = {Computer Science - Machine Learning, Quantitative Biology - Quantitative Methods, Statistics - Machine Learning},
}@article{huang_generative_2019,
title = {Generative {Adversarial} {Privacy}},
url = {http://arxiv.org/abs/1807.05306},
abstract = {We present a data-driven framework called generative adversarial privacy (GAP). Inspired by recent advancements in generative adversarial networks (GANs), GAP allows the data holder to learn the privatization mechanism directly from the data. Under GAP, finding the optimal privacy mechanism is formulated as a constrained minimax game between a privatizer and an adversary. We show that for appropriately chosen adversarial loss functions, GAP provides privacy guarantees against strong information-theoretic adversaries. We also evaluate GAP's performance on the GENKI face database.},
urldate = {2020-09-08},
journal = {arXiv:1807.05306 [cs, math, stat]},
author = {Huang, Chong and Kairouz, Peter and Chen, Xiao and Sankar, Lalitha and Rajagopal, Ram},
month = jun,
year = {2019},
note = {arXiv: 1807.05306},
keywords = {\#broken, Computer Science - Computer Science and Game Theory, Computer Science - Cryptography and Security, Computer Science - Information Theory, Computer Science - Machine Learning, Jab/\#Pre, Statistics - Machine Learning, ⛔ No DOI found},
}@article{tavakoli_prioritizing_2019,
title = {Prioritizing {Starting} {States} for {Reinforcement} {Learning}},
url = {http://arxiv.org/abs/1811.11298},
abstract = {Online, off-policy reinforcement learning algorithms are able to use an experience memory to remember and replay past experiences. In prior work, this approach was used to stabilize training by breaking the temporal correlations of the updates and avoiding the rapid forgetting of possibly rare experiences. In this work, we propose a conceptually simple framework that uses an experience memory to help exploration by prioritizing the starting states from which the agent starts acting in the environment, importantly, in a fashion that is also compatible with on-policy algorithms. Given the capacity to restart the agent in states corresponding to its past observations, we achieve this objective by (i) enabling the agent to restart in states belonging to significant past experiences (e.g., nearby goals), and (ii) promoting faster coverage of the state space through starting from a more diverse set of states. While, using a good priority measure to identify significant past transitions, we expect case (i) to more considerably help exploration in certain domains (e.g., sparse reward tasks), we hypothesize that case (ii) will generally be beneficial, even without any prioritization. We show empirically that our approach improves learning performance for both off-policy and on-policy deep reinforcement learning methods, with most notable gains in highly sparse reward tasks.},
urldate = {2020-03-06},
journal = {arXiv:1811.11298 [cs, stat]},
author = {Tavakoli, Arash and Levdik, Vitaly and Islam, Riashat and Kormushev, Petar},
month = jan,
year = {2019},
note = {arXiv: 1811.11298},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Statistics - Machine Learning},
}@article{gulian_machine_2019,
title = {Machine {Learning} of {Space}-{Fractional} {Differential} {Equations}},
url = {http://arxiv.org/abs/1808.00931},
abstract = {Data-driven discovery of “hidden physics” – i.e., machine learning of differential equation models underlying observed data – has recently been approached by embedding the discovery problem into a Gaussian Process regression of spatial data, treating and discovering unknown equation parameters as hyperparameters of a “physics informed” Gaussian Process kernel. This kernel includes the parametrized differential operators applied to a prior covariance kernel. We extend this framework to the data-driven discovery of linear space-fractional differential equations. The methodology is compatible with a wide variety of space-fractional operators in Rd and stationary covariance kernels, including the Mat{\'e}rn class, and allows for optimizing the Mat{\'e}rn parameter during training. Since fractional derivatives are typically not given by closed-form analytic expressions, the main challenges to be addressed are a user-friendly, general way to set up fractional-order derivatives of covariance kernels, together with feasible and robust numerical methods for such implementations. Making use of the simple Fourier-space representation of space-fractional derivatives in Rd, we provide a unified set of integral formulas for the resulting Gaussian Process kernels. The shift property of the Fourier transform results in formulas involving d-dimensional integrals that can be efficiently treated using generalized Gauss-Laguerre quadrature.},
language = {en},
urldate = {2022-01-19},
journal = {arXiv:1808.00931 [cs, stat]},
author = {Gulian, Mamikon and Raissi, Maziar and Perdikaris, Paris and Karniadakis, George},
month = aug,
year = {2019},
note = {39 citations (Semantic Scholar/arXiv) [2023-02-27]
arXiv: 1808.00931},
keywords = {/unread, 35R11, 65N21, 62M10, 62F15, 60G15, 60G52, Computer Science - Machine Learning, Statistics - Machine Learning, ⛔ No DOI found},
}@article{devapujula_learning_2019,
title = {Learning to {Rank} {Broad} and {Narrow} {Queries} in {E}-{Commerce}},
url = {http://arxiv.org/abs/1907.01549},
abstract = {Search is a prominent channel for discovering products on an e-commerce platform. Ranking products retrieved from search becomes crucial to address customer's need and optimize for business metrics. While learning to Rank (LETOR) models have been extensively studied and have demonstrated efficacy in the context of web search; it is a relatively new research area to be explored in the e-commerce. In this paper, we present a framework for building LETOR model for an e-commerce platform. We analyze user queries and propose a mechanism to segment queries between broad and narrow based on user's intent. We discuss different types of features - query, product and query-product and discuss challenges in using them. We show that sparsity in product features can be tackled through a denoising auto-encoder while skip-gram based word embeddings help solve the query-product sparsity issues. We also present various target metrics that can be employed for evaluating search results and compare their robustness. Further, we build and compare performances of both pointwise and pairwise LETOR models on fashion category data set. We also build and compare distinct models for broad and narrow queries, analyze feature importance across these and show that these specialized models perform better than a combined model in the fashion world.},
urldate = {2020-04-08},
journal = {arXiv:1907.01549 [cs, stat]},
author = {Devapujula, Siddhartha and Arora, Sagar and Borar, Sumit},
month = jul,
year = {2019},
note = {arXiv: 1907.01549},
keywords = {Computer Science - Computation and Language, Computer Science - Information Retrieval, Computer Science - Machine Learning, Statistics - Machine Learning},
}@article{li_learning_2019,
title = {Learning {Overparameterized} {Neural} {Networks} via {Stochastic} {Gradient} {Descent} on {Structured} {Data}},
url = {http://arxiv.org/abs/1808.01204},
abstract = {Neural networks have many successful applications, while much less theoretical understanding has been gained. Towards bridging this gap, we study the problem of learning a two-layer overparameterized ReLU neural network for multi-class classification via stochastic gradient descent (SGD) from random initialization. In the overparameterized setting, when the data comes from mixtures of well-separated distributions, we prove that SGD learns a network with a small generalization error, albeit the network has enough capacity to fit arbitrary labels. Furthermore, the analysis provides interesting insights into several aspects of learning neural networks and can be verified based on empirical studies on synthetic data and on the MNIST dataset.},
urldate = {2022-03-02},
journal = {arXiv:1808.01204 [cs, stat]},
author = {Li, Yuanzhi and Liang, Yingyu},
month = aug,
year = {2019},
note = {arXiv: 1808.01204},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
}@article{rahimian_distributionally_2019,
title = {Distributionally {Robust} {Optimization}: {A} {Review}},
shorttitle = {Distributionally {Robust} {Optimization}},
url = {http://arxiv.org/abs/1908.05659},
abstract = {The concepts of risk-aversion, chance-constrained optimization, and robust optimization have developed significantly over the last decade. Statistical learning community has also witnessed a rapid theoretical and applied growth by relying on these concepts. A modeling framework, called distributionally robust optimization (DRO), has recently received significant attention in both the operations research and statistical learning communities. This paper surveys main concepts and contributions to DRO, and its relationships with robust optimization, risk-aversion, chance-constrained optimization, and function regularization.},
language = {en},
urldate = {2022-01-19},
journal = {arXiv:1908.05659 [cs, math, stat]},
author = {Rahimian, Hamed and Mehrotra, Sanjay},
month = aug,
year = {2019},
note = {arXiv: 1908.05659},
keywords = {/unread, Computer Science - Machine Learning, Mathematics - Optimization and Control, NSFC, Statistics - Machine Learning, ⛔ No DOI found},
}@article{zamudio-fernandez_higan:_2019,
title = {{HIGAN}: {Cosmic} {Neutral} {Hydrogen} with {Generative} {Adversarial} {Networks}},
volume = {1904},
shorttitle = {{HIGAN}},
url = {http://adsabs.harvard.edu/abs/2019arXiv190412846Z},
abstract = {One of the most promising ways to observe the Universe is by detecting the 21cm emission from cosmic neutral hydrogen (HI) through
radio-telescopes. Those observations can shed light on fundamental astrophysical questions only if accurate theoretical predictions are available. In order to maximize the scientific return of these surveys, those predictions need to include different observables and be precise on non-linear scales. Currently, one of the best ways to achieve this is via cosmological hydrodynamic simulations; however, the computational cost of these simulations is high -- tens of millions of CPU hours. In this work, we use Wasserstein Generative Adversarial Networks (WGANs) to generate new high-resolution (\$35{\textasciitilde}h{\textasciicircum}\{-1\}\{{\textbackslash}rm kpc\}\$) 3D realizations of cosmic HI at \$z=5\$. We do so by sampling from a 100-dimension manifold, learned by the generator, that characterizes the fully non-linear abundance and clustering of cosmic HI from the state-of-the-art
simulation IllustrisTNG. We show that different statistical properties of the produced samples -- 1D PDF, power spectrum, bispectrum, and void size function -- match very well those of IllustrisTNG, and outperform state-of-the-art models such as Halo Occupation Distributions (HODs). Our WGAN samples reproduce the abundance of HI across 9 orders of magnitude, from the Ly\${\textbackslash}alpha\$ forest to Damped Lyman Absorbers. WGAN can produce new samples orders of magnitude faster than hydrodynamic simulations.},
urldate = {2019-04-30},
journal = {arXiv e-prints},
author = {Zamudio-Fernandez, Juan and Okan, Atakan and Villaescusa-Navarro, Francisco and Bilaloglu, Seda and Derin Cengiz, Asena and He, Siyu and Perreault Levasseur, Laurence and Ho, Shirley},
month = apr,
year = {2019},
keywords = {Astrophysics - Cosmology and Nongalactic Astrophysics, Astrophysics - Instrumentation and Methods for Astrophysics, Computer Science - Machine Learning},
pages = {arXiv:1904.12846},
}@article{rahaman_spectral_2019,
title = {On the {Spectral} {Bias} of {Neural} {Networks}},
url = {http://arxiv.org/abs/1806.08734},
abstract = {Neural networks are known to be a class of highly expressive functions able to fit even random input-output mappings with \$100{\textbackslash}\%\$ accuracy. In this work, we present properties of neural networks that complement this aspect of expressivity. By using tools from Fourier analysis, we show that deep ReLU networks are biased towards low frequency functions, meaning that they cannot have local fluctuations without affecting their global behavior. Intuitively, this property is in line with the observation that over-parameterized networks find simple patterns that generalize across data samples. We also investigate how the shape of the data manifold affects expressivity by showing evidence that learning high frequencies gets {\textbackslash}emph\{easier\} with increasing manifold complexity, and present a theoretical understanding of this behavior. Finally, we study the robustness of the frequency components with respect to parameter perturbation, to develop the intuition that the parameters must be finely tuned to express high frequency functions.},
urldate = {2022-03-02},
journal = {arXiv:1806.08734 [cs, stat]},
author = {Rahaman, Nasim and Baratin, Aristide and Arpit, Devansh and Draxler, Felix and Lin, Min and Hamprecht, Fred A. and Bengio, Yoshua and Courville, Aaron},
month = may,
year = {2019},
note = {arXiv: 1806.08734},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
}@article{lillicrap_continuous_2019,
title = {Continuous control with deep reinforcement learning},
url = {http://arxiv.org/abs/1509.02971},
abstract = {We adapt the ideas underlying the success of Deep Q-Learning to the continuous action domain. We present an actor-critic, model-free algorithm based on the deterministic policy gradient that can operate over continuous action spaces. Using the same learning algorithm, network architecture and hyper-parameters, our algorithm robustly solves more than 20 simulated physics tasks, including classic problems such as cartpole swing-up, dexterous manipulation, legged locomotion and car driving. Our algorithm is able to find policies whose performance is competitive with those found by a planning algorithm with full access to the dynamics of the domain and its derivatives. We further demonstrate that for many of the tasks the algorithm can learn policies end-to-end: directly from raw pixel inputs.},
urldate = {2022-02-22},
journal = {arXiv:1509.02971 [cs, stat]},
author = {Lillicrap, Timothy P. and Hunt, Jonathan J. and Pritzel, Alexander and Heess, Nicolas and Erez, Tom and Tassa, Yuval and Silver, David and Wierstra, Daan},
month = jul,
year = {2019},
note = {arXiv: 1509.02971},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
}@misc{carlini_evaluating_2019,
title = {On {Evaluating} {Adversarial} {Robustness}},
url = {http://arxiv.org/abs/1902.06705},
abstract = {Correctly evaluating defenses against adversarial examples has proven to be extremely difficult. Despite the significant amount of recent work attempting to design defenses that withstand adaptive attacks, few have succeeded; most papers that propose defenses are quickly shown to be incorrect.},
language = {en},
urldate = {2024-06-18},
publisher = {arXiv},
author = {Carlini, Nicholas and Athalye, Anish and Papernot, Nicolas and Brendel, Wieland and Rauber, Jonas and Tsipras, Dimitris and Goodfellow, Ian and Madry, Aleksander and Kurakin, Alexey},
month = feb,
year = {2019},
note = {arXiv:1902.06705 [cs, stat]},
keywords = {Computer Science - Cryptography and Security, Computer Science - Machine Learning, Jab/\#Pre, Statistics - Machine Learning},
}@misc{bellemare_distributional_2019,
title = {Distributional reinforcement learning with linear function approximation},
url = {http://arxiv.org/abs/1902.03149},
abstract = {Despite many algorithmic advances, our theoretical understanding of practical distributional reinforcement learning methods remains limited. One exception is Rowland et al. (2018)’s analysis of the C51 algorithm in terms of the Cram{\'e}r distance, but their results only apply to the tabular setting and ignore C51’s use of a softmax to produce normalized distributions. In this paper we adapt the Cram{\'e}r distance to deal with arbitrary vectors. From it we derive a new distributional algorithm which is fully Cram{\'e}r-based and can be combined to linear function approximation, with formal guarantees in the context of policy evaluation. In allowing the model’s prediction to be any real vector, we lose the probabilistic interpretation behind the method, but otherwise maintain the appealing properties of distributional approaches. To the best of our knowledge, ours is the first proof of convergence of a distributional algorithm combined with function approximation. Perhaps surprisingly, our results provide evidence that Cram{\'e}r-based distributional methods may perform worse than directly approximating the value function.},
language = {en},
urldate = {2023-10-13},
publisher = {arXiv},
author = {Bellemare, Marc G. and Roux, Nicolas Le and Castro, Pablo Samuel and Moitra, Subhodeep},
month = feb,
year = {2019},
note = {arXiv:1902.03149 [cs, stat]},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
}@misc{allen-zhu_convergence_2019,
title = {A {Convergence} {Theory} for {Deep} {Learning} via {Over}-{Parameterization}},
url = {http://arxiv.org/abs/1811.03962},
doi = {10.48550/arXiv.1811.03962},
abstract = {Deep neural networks (DNNs) have demonstrated dominating performance in many fields; since AlexNet, networks used in practice are going wider and deeper. On the theoretical side, a long line of works has been focusing on training neural networks with one hidden layer. The theory of multi-layer networks remains largely unsettled. In this work, we prove why stochastic gradient descent (SGD) can find \${\textbackslash}textit\{global minima\}\$ on the training objective of DNNs in \${\textbackslash}textit\{polynomial time\}\$. We only make two assumptions: the inputs are non-degenerate and the network is over-parameterized. The latter means the network width is sufficiently large: \${\textbackslash}textit\{polynomial\}\$ in \$L\$, the number of layers and in \$n\$, the number of samples. Our key technique is to derive that, in a sufficiently large neighborhood of the random initialization, the optimization landscape is almost-convex and semi-smooth even with ReLU activations. This implies an equivalence between over-parameterized neural networks and neural tangent kernel (NTK) in the finite (and polynomial) width setting. As concrete examples, starting from randomly initialized weights, we prove that SGD can attain 100\% training accuracy in classification tasks, or minimize regression loss in linear convergence speed, with running time polynomial in \$n,L\$. Our theory applies to the widely-used but non-smooth ReLU activation, and to any smooth and possibly non-convex loss functions. In terms of network architectures, our theory at least applies to fully-connected neural networks, convolutional neural networks (CNN), and residual neural networks (ResNet).},
urldate = {2024-11-21},
publisher = {arXiv},
author = {Allen-Zhu, Zeyuan and Li, Yuanzhi and Song, Zhao},
month = jun,
year = {2019},
note = {arXiv:1811.03962},
keywords = {Computer Science - Data Structures and Algorithms, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Mathematics - Optimization and Control, Statistics - Machine Learning},
}@article{qu_probabilistic_2019,
title = {Probabilistic {Logic} {Neural} {Networks} for {Reasoning}},
url = {http://arxiv.org/abs/1906.08495},
abstract = {Knowledge graph reasoning, which aims at predicting the missing facts through reasoning with the observed facts, is critical to many applications. Such a problem has been widely explored by traditional logic rule-based approaches and recent knowledge graph embedding methods. A principled logic rule-based approach is the Markov Logic Network (MLN), which is able to leverage domain knowledge with first-order logic and meanwhile handle their uncertainty. However, the inference of MLNs is usually very difficult due to the complicated graph structures. Different from MLNs, knowledge graph embedding methods (e.g. TransE, DistMult) learn effective entity and relation embeddings for reasoning, which are much more effective and efficient. However, they are unable to leverage domain knowledge. In this paper, we propose the probabilistic Logic Neural Network (pLogicNet), which combines the advantages of both methods. A pLogicNet defines the joint distribution of all possible triplets by using a Markov logic network with first-order logic, which can be efficiently optimized with the variational EM algorithm. In the E-step, a knowledge graph embedding model is used for inferring the missing triplets, while in the M-step, the weights of logic rules are updated based on both the observed and predicted triplets. Experiments on multiple knowledge graphs prove the effectiveness of pLogicNet over many competitive baselines.},
language = {en},
urldate = {2019-07-09},
journal = {arXiv:1906.08495 [cs, stat]},
author = {Qu, Meng and Tang, Jian},
month = jun,
year = {2019},
note = {arXiv: 1906.08495},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Statistics - Machine Learning},
}@inproceedings{DBLP:conf/nips/BerthelotCGPOR19,
author = {David Berthelot and
Nicholas Carlini and
Ian J. Goodfellow and
Nicolas Papernot and
Avital Oliver and
Colin Raffel},
editor = {Hanna M. Wallach and
Hugo Larochelle and
Alina Beygelzimer and
Florence d'Alch{\'{e}}{-}Buc and
Emily B. Fox and
Roman Garnett},
title = {MixMatch: {A} Holistic Approach to Semi-Supervised Learning},
booktitle = {Advances in Neural Information Processing Systems 32: Annual Conference
on Neural Information Processing Systems 2019, NeurIPS 2019, 8-14
December 2019, Vancouver, BC, Canada},
pages = {5050--5060},
year = {2019},
url = {http://papers.nips.cc/paper/8749-mixmatch-a-holistic-approach-to-semi-supervised-learning},
timestamp = {Fri, 06 Mar 2020 16:59:09 +0100},
biburl = {https://dblp.org/rec/conf/nips/BerthelotCGPOR19.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}@article{mcdermott_deep_2018,
title = {Deep {Echo} {State} {Networks} with {Uncertainty} {Quantification} for {Spatio}-{Temporal} {Forecasting}},
url = {http://arxiv.org/abs/1806.10728},
abstract = {Long-lead forecasting for spatio-temporal systems can often entail complex nonlinear dynamics that are difficult to specify a priori. Current statistical methodologies for modeling these processes are often highly parameterized and thus, challenging to implement from a computational perspective. One potential parsimonious solution to this problem is a method from the dynamical systems and engineering literature referred to as an echo state network (ESN). ESN models use so-called {\it reservoir computing} to efficiently compute recurrent neural network (RNN) forecasts. Moreover, so-called "deep" models have recently been shown to be successful at predicting high-dimensional complex nonlinear processes, particularly those with multiple spatial and temporal scales of variability (such as we often find in spatio-temporal environmental data). Here we introduce a deep ensemble ESN (D-EESN) model. We present two versions of this model for spatio-temporal processes that both produce forecasts and associated measures of uncertainty. The first approach utilizes a bootstrap ensemble framework and the second is developed within a hierarchical Bayesian framework (BD-EESN). This more general hierarchical Bayesian framework naturally accommodates non-Gaussian data types and multiple levels of uncertainties. The methodology is first applied to a data set simulated from a novel non-Gaussian multiscale Lorenz-96 dynamical system simulation model and then to a long-lead United States (U.S.) soil moisture forecasting application.},
urldate = {2019-12-12},
journal = {arXiv:1806.10728 [cs, stat]},
author = {McDermott, Patrick L. and Wikle, Christopher K.},
month = sep,
year = {2018},
note = {arXiv: 1806.10728},
keywords = {Computer Science - Machine Learning, Forecasting, Statistics - Machine Learning, long-lead forecasting},
}@article{tallec_can_2018,
title = {Can recurrent neural networks warp time?},
url = {http://arxiv.org/abs/1804.11188},
abstract = {Successful recurrent models such as long short-term memories (LSTMs) and gated recurrent units (GRUs) use ad hoc gating mechanisms. Empirically these models have been found to improve the learning of medium to long term temporal dependencies and to help with vanishing gradient issues. We prove that learnable gates in a recurrent model formally provide quasi- invariance to general time transformations in the input data. We recover part of the LSTM architecture from a simple axiomatic approach. This result leads to a new way of initializing gate biases in LSTMs and GRUs. Ex- perimentally, this new chrono initialization is shown to greatly improve learning of long term dependencies, with minimal implementation effort.},
urldate = {2022-03-02},
journal = {arXiv:1804.11188 [cs, stat]},
author = {Tallec, Corentin and Ollivier, Yann},
month = mar,
year = {2018},
note = {arXiv: 1804.11188},
keywords = {Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Statistics - Machine Learning},
}@article{haber_stable_2018,
title = {Stable {Architectures} for {Deep} {Neural} {Networks}},
volume = {34},
issn = {0266-5611, 1361-6420},
url = {http://arxiv.org/abs/1705.03341},
doi = {10.1088/1361-6420/aa9a90},
abstract = {Deep neural networks have become invaluable tools for supervised machine learning, e.g., classification of text or images. While often offering superior results over traditional techniques and successfully expressing complicated patterns in data, deep architectures are known to be challenging to design and train such that they generalize well to new data. Critical issues with deep architectures are numerical instabilities in derivative-based learning algorithms commonly called exploding or vanishing gradients. In this paper, we propose new forward propagation techniques inspired by systems of Ordinary Differential Equations (ODE) that overcome this challenge and lead to well-posed learning problems for arbitrarily deep networks.},
language = {en},
number = {1},
urldate = {2023-07-05},
journal = {Inverse Problems},
author = {Haber, Eldad and Ruthotto, Lars},
month = jan,
year = {2018},
note = {512 citations (Semantic Scholar/arXiv) [2023-07-05]
512 citations (Semantic Scholar/DOI) [2023-07-05]
arXiv:1705.03341 [cs, math]},
keywords = {/unread, 68T05, 65L09, 49N90, Computer Science - Machine Learning, I.2.6, Mathematics - Numerical Analysis, Mathematics - Optimization and Control},
pages = {014004},
}@techreport{carlsson_topological_2018,
title = {Topological {Approaches} to {Deep} {Learning}},
url = {http://arxiv.org/abs/1811.01122},
abstract = {We perform topological data analysis on the internal states of convolutional deep neural networks to develop an understanding of the computations that they perform. We apply this understanding to modify the computations so as to (a) speed up computations and (b) improve generalization from one data set of digits to another. One byproduct of the analysis is the production of a geometry on new sets of features on data sets of images, and use this observation to develop a methodology for constructing analogues of CNN's for many other geometries, including the graph structures constructed by topological data analysis.},
number = {arXiv:1811.01122},
urldate = {2022-05-28},
institution = {arXiv},
author = {Carlsson, Gunnar and Gabrielsson, Rickard Brüel},
month = nov,
year = {2018},
doi = {10.48550/arXiv.1811.01122},
note = {arXiv:1811.01122 [cs, math, stat]
type: article},
keywords = {68T05, 55N35, 62-07, Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Mathematics - Algebraic Topology, Statistics - Machine Learning},
}@article{velickovic_graph_2018,
title = {Graph {Attention} {Networks}},
url = {http://arxiv.org/abs/1710.10903},
abstract = {We present graph attention networks (GATs), novel neural network architectures that operate on graph-structured data, leveraging masked self-attentional layers to address the shortcomings of prior methods based on graph convolutions or their approximations. By stacking layers in which nodes are able to attend over their neighborhoods' features, we enable (implicitly) specifying different weights to different nodes in a neighborhood, without requiring any kind of costly matrix operation (such as inversion) or depending on knowing the graph structure upfront. In this way, we address several key challenges of spectral-based graph neural networks simultaneously, and make our model readily applicable to inductive as well as transductive problems. Our GAT models have achieved or matched state-of-the-art results across four established transductive and inductive graph benchmarks: the Cora, Citeseer and Pubmed citation network datasets, as well as a protein-protein interaction dataset (wherein test graphs remain unseen during training).},
urldate = {2022-02-14},
journal = {arXiv:1710.10903 [cs, stat]},
author = {Veličković, Petar and Cucurull, Guillem and Casanova, Arantxa and Romero, Adriana and Liò, Pietro and Bengio, Yoshua},
month = feb,
year = {2018},
note = {02847
8 citations (Inspire/arXiv) [2022-02-15]
arXiv: 1710.10903},
keywords = {Classics, Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Computer Science - Social and Information Networks, Statistics - Machine Learning, To Read},
}@article{pratama_autonomous_2018,
title = {Autonomous {Deep} {Learning}: {Incremental} {Learning} of {Denoising} {Autoencoder} for {Evolving} {Data} {Streams}},
shorttitle = {Autonomous {Deep} {Learning}},
url = {http://arxiv.org/abs/1809.09081},
abstract = {The generative learning phase of Autoencoder (AE) and its successor Denosing Autoencoder (DAE) enhances the flexibility of data stream method in exploiting unlabelled samples. Nonetheless, the feasibility of DAE for data stream analytic deserves in-depth study because it characterizes a fixed network capacity which cannot adapt to rapidly changing environments. An automated construction of a denoising autoeconder, namely deep evolving denoising autoencoder (DEVDAN), is proposed in this paper. DEVDAN features an open structure both in the generative phase and in the discriminative phase where input features can be automatically added and discarded on the fly. A network significance (NS) method is formulated in this paper and is derived from the bias-variance concept. This method is capable of estimating the statistical contribution of the network structure and its hidden units which precursors an ideal state to add or prune input features. Furthermore, DEVDAN is free of the problem- specific threshold and works fully in the single-pass learning fashion. The efficacy of DEVDAN is numerically validated using nine non-stationary data stream problems simulated under the prequential test-then-train protocol where DEVDAN is capable of delivering an improvement of classification accuracy to recently published online learning works while having flexibility in the automatic extraction of robust input features and in adapting to rapidly changing environments.},
urldate = {2022-03-19},
journal = {arXiv:1809.09081 [cs, stat]},
author = {Pratama, Mahardhika and Ashfahani, Andri and Ong, Yew Soon and Ramasamy, Savitha and Lughofer, Edwin},
month = sep,
year = {2018},
note = {arXiv: 1809.09081},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Statistics - Machine Learning},
}@article{van_der_westhuizen_unreasonable_2018,
title = {The unreasonable effectiveness of the forget gate},
url = {http://arxiv.org/abs/1804.04849},
abstract = {Given the success of the gated recurrent unit, a natural question is whether all the gates of the long short-term memory (LSTM) network are necessary. Previous research has shown that the forget gate is one of the most important gates in the LSTM. Here we show that a forget-gate-only version of the LSTM with chronoinitialized biases, not only provides computational savings but outperforms the standard LSTM on multiple benchmark datasets and competes with some of the best contemporary models. Our proposed network, the JANET, achieves accuracies of 99\% and 92.5\% on the MNIST and pMNIST datasets, outperforming the standard LSTM which yields accuracies of 98.5\% and 91\%.},
language = {en},
urldate = {2022-01-19},
journal = {arXiv:1804.04849 [cs, stat]},
author = {van der Westhuizen, Jos and Lasenby, Joan},
month = sep,
year = {2018},
note = {arXiv: 1804.04849},
keywords = {/unread, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Statistics - Machine Learning, ⛔ No DOI found},
}@misc{hartmann_eeg-gan_2018-2,
title = {{EEG}-{GAN}: {Generative} adversarial networks for electroencephalograhic ({EEG}) brain signals},
shorttitle = {{EEG}-{GAN}},
url = {http://arxiv.org/abs/1806.01875},
abstract = {Generative adversarial networks (GANs) are recently highly successful in generative applications involving images and start being applied to time series data. Here we describe EEG-GAN as a framework to generate electroencephalographic (EEG) brain signals. We introduce a modification to the improved training of Wasserstein GANs to stabilize training and investigate a range of architectural choices critical for time series generation (most notably up- and down-sampling). For evaluation we consider and compare different metrics such as Inception score, Frechet inception distance and sliced Wasserstein distance, together showing that our EEG-GAN framework generated naturalistic EEG examples. It thus opens up a range of new generative application scenarios in the neuroscientific and neurological context, such as data augmentation in brain-computer interfacing tasks, EEG super-sampling, or restoration of corrupted data segments. The possibility to generate signals of a certain class and/or with specific properties may also open a new avenue for research into the underlying structure of brain signals.},
urldate = {2023-01-15},
publisher = {arXiv},
author = {Hartmann, Kay Gregor and Schirrmeister, Robin Tibor and Ball, Tonio},
month = jun,
year = {2018},
note = {arXiv:1806.01875 [cs, eess, q-bio, stat]
version: 1},
keywords = {Computer Science - Machine Learning, Electrical Engineering and Systems Science - Signal Processing, Quantitative Biology - Neurons and Cognition, Statistics - Machine Learning},
}@article{babiker_introduction_2018,
title = {An {Introduction} to {Deep} {Visual} {Explanation}},
url = {http://arxiv.org/abs/1711.09482},
abstract = {The practical impact of deep learning on complex supervised learning problems has been significant, so much so that almost every Artificial Intelligence problem, or at least a portion thereof, has been somehow recast as a deep learning problem. The applications appeal is significant, but this appeal is increasingly challenged by what some call the challenge of explainability, or more generally the more traditional challenge of debuggability: if the outcomes of a deep learning process produce unexpected results (e.g., less than expected performance of a classifier), then there is little available in the way of theories or tools to help investigate the potential causes of such unexpected behavior, especially when this behavior could impact people’s lives.},
language = {en},
urldate = {2022-01-19},
journal = {arXiv:1711.09482 [cs, stat]},
author = {Babiker, Housam Khalifa Bashier and Goebel, Randy},
month = mar,
year = {2018},
note = {arXiv: 1711.09482},
keywords = {/unread, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning, ⛔ No DOI found},
}@article{reis_probabilistic_2018,
title = {Probabilistic {Random} {Forest}: {A} machine learning algorithm for noisy datasets},
volume = {1811},
shorttitle = {Probabilistic {Random} {Forest}},
url = {http://adsabs.harvard.edu/abs/2018arXiv181105994R},
abstract = {Machine learning (ML) algorithms become increasingly important in the analysis of astronomical data. However, since most ML algorithms are not designed to take data uncertainties into account, ML based studies are mostly restricted to data with high signal-to-noise ratio. Astronomical datasets of such high-quality are uncommon. In this work we modify the long-established Random Forest (RF) algorithm to take into account uncertainties in the measurements (i.e., features) as well as in the assigned classes (i.e., labels). To do so, the Probabilistic Random Forest (PRF) algorithm treats the features and labels as probability distribution functions, rather than deterministic quantities. We perform a variety of experiments where we inject different types of noise to a dataset, and compare the accuracy of the PRF to that of RF. The PRF outperforms RF in all cases, with a moderate increase in running time. We find an improvement in classification accuracy of up to 10\% in the case of noisy features, and up to 30\% in the case of noisy labels. The PRF accuracy decreased by less then 5\% for a dataset with as many as 45\% misclassified objects, compared to a clean dataset. Apart from improving the prediction accuracy in noisy datasets, the PRF naturally copes with missing values in the data, and outperforms RF when applied to a dataset with different noise characteristics in the training and test sets, suggesting that it can be used for Transfer Learning.},
urldate = {2018-11-20},
journal = {ArXiv e-prints},
author = {Reis, Itamar and Baron, Dalya and Shahaf, Sahar},
month = nov,
year = {2018},
keywords = {Astrophysics - Instrumentation and Methods for Astrophysics, Computer Science - Machine Learning},
pages = {arXiv:1811.05994},
}@article{prenger_waveglow_2018,
title = {{WaveGlow}: {A} {Flow}-based {Generative} {Network} for {Speech} {Synthesis}},
shorttitle = {{WaveGlow}},
url = {http://arxiv.org/abs/1811.00002},
abstract = {In this paper we propose WaveGlow: a flow-based network capable of generating high quality speech from mel-spectrograms. WaveGlow combines insights from Glow and WaveNet in order to provide fast, efficient and high-quality audio synthesis, without the need for auto-regression. WaveGlow is implemented using only a single network, trained using only a single cost function: maximizing the likelihood of the training data, which makes the training procedure simple and stable. Our PyTorch implementation produces audio samples at a rate of more than 500 kHz on an NVIDIA V100 GPU. Mean Opinion Scores show that it delivers audio quality as good as the best publicly available WaveNet implementation. All code will be made publicly available online.},
urldate = {2022-02-18},
journal = {arXiv:1811.00002 [cs, eess, stat]},
author = {Prenger, Ryan and Valle, Rafael and Catanzaro, Bryan},
month = oct,
year = {2018},
note = {arXiv: 1811.00002},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing, Statistics - Machine Learning},
}@article{peng_sfv:_2018,
title = {{SFV}: {Reinforcement} {Learning} of {Physical} {Skills} from {Videos}},
shorttitle = {{SFV}},
url = {http://arxiv.org/abs/1810.03599},
abstract = {Data-driven character animation based on motion capture can produce highly naturalistic behaviors and, when combined with physics simulation, can provide for natural procedural responses to physical perturbations, environmental changes, and morphological discrepancies. Motion capture remains the most popular source of motion data, but collecting mocap data typically requires heavily instrumented environments and actors. In this paper, we propose a method that enables physically simulated characters to learn skills from videos (SFV). Our approach, based on deep pose estimation and deep reinforcement learning, allows data-driven animation to leverage the abundance of publicly available video clips from the web, such as those from YouTube. This has the potential to enable fast and easy design of character controllers simply by querying for video recordings of the desired behavior. The resulting controllers are robust to perturbations, can be adapted to new settings, can perform basic object interactions, and can be retargeted to new morphologies via reinforcement learning. We further demonstrate that our method can predict potential human motions from still images, by forward simulation of learned controllers initialized from the observed pose. Our framework is able to learn a broad range of dynamic skills, including locomotion, acrobatics, and martial arts.},
urldate = {2018-10-09},
journal = {arXiv:1810.03599 [cs]},
author = {Peng, Xue Bin and Kanazawa, Angjoo and Malik, Jitendra and Abbeel, Pieter and Levine, Sergey},
month = oct,
year = {2018},
note = {arXiv: 1810.03599},
keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics, Computer Science - Machine Learning}
}@article{zaheer_deep_2018,
title = {Deep {Sets}},
url = {http://arxiv.org/abs/1703.06114},
abstract = {We study the problem of designing models for machine learning tasks defined on {\textbackslash}emph\{sets\}. In contrast to traditional approach of operating on fixed dimensional vectors, we consider objective functions defined on sets that are invariant to permutations. Such problems are widespread, ranging from estimation of population statistics {\textbackslash}cite\{poczos13aistats\}, to anomaly detection in piezometer data of embankment dams {\textbackslash}cite\{Jung15Exploration\}, to cosmology {\textbackslash}cite\{Ntampaka16Dynamical,Ravanbakhsh16ICML1\}. Our main theorem characterizes the permutation invariant functions and provides a family of functions to which any permutation invariant objective function must belong. This family of functions has a special structure which enables us to design a deep network architecture that can operate on sets and which can be deployed on a variety of scenarios including both unsupervised and supervised learning tasks. We also derive the necessary and sufficient conditions for permutation equivariance in deep models. We demonstrate the applicability of our method on population statistic estimation, point cloud classification, set expansion, and outlier detection.},
urldate = {2022-03-02},
journal = {arXiv:1703.06114 [cs, stat]},
author = {Zaheer, Manzil and Kottur, Satwik and Ravanbakhsh, Siamak and Poczos, Barnabas and Salakhutdinov, Ruslan and Smola, Alexander},
month = apr,
year = {2018},
note = {arXiv: 1703.06114},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
}@article{yu_performance_2018,
title = {Performance {Estimation} of {Synthesis} {Flows} cross {Technologies} using {LSTMs} and {Transfer} {Learning}},
url = {http://arxiv.org/abs/1811.06017},
abstract = {Due to the increasing complexity of Integrated Circuits (ICs) and System-on-Chip (SoC), developing high-quality synthesis flows within a short market time becomes more challenging. We propose a general approach that precisely estimates the Quality-of-Result (QoR), such as delay and area, of unseen synthesis flows for specific designs. The main idea is training a Recurrent Neural Network (RNN) regressor, where the flows are inputs and QoRs are ground truth. The RNN regressor is constructed with Long Short-Term Memory (LSTM) and fully-connected layers. This approach is demonstrated with 1.2 million data points collected using 14nm, 7nm regular-voltage (RVT), and 7nm low-voltage (LVT) FinFET technologies with twelve IC designs. The accuracy of predicting the QoRs (delay and area) within one technology is \${\textbackslash}boldsymbol\{{\textbackslash}geq\}\${\textbackslash}textbf\{98.0\}{\textbackslash}\% over \${\textbackslash}sim\$240,000 test points. To enable accurate predictions cross different technologies and different IC designs, we propose a transfer-learning approach that utilizes the model pre-trained with 14nm datasets. Our transfer learning approach obtains estimation accuracy \${\textbackslash}geq\$96.3{\textbackslash}\% over \${\textbackslash}sim\$960,000 test points, using only 100 data points for training.},
urldate = {2019-03-16},
journal = {arXiv:1811.06017 [cs, stat]},
author = {Yu, Cunxi and Zhou, Wang},
month = nov,
year = {2018},
note = {arXiv: 1811.06017},
keywords = {\#broken, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Jab/\#Pre, Statistics - Machine Learning, ⛔ No DOI found},
}@article{bach_snorkel_2018,
title = {Snorkel {DryBell}: {A} {Case} {Study} in {Deploying} {Weak} {Supervision} at {Industrial} {Scale}},
shorttitle = {Snorkel {DryBell}},
url = {http://arxiv.org/abs/1812.00417},
abstract = {Labeling training data is one of the most costly bottlenecks in developing machine learning-based applications. We present a first-of-its-kind study showing how existing knowledge resources from across an organization can be used as weak supervision in order to bring development time and cost down by an order of magnitude, and introduce Snorkel DryBell, a new weak supervision management system for this setting. Snorkel DryBell builds on the Snorkel framework, extending it in three critical aspects: flexible, template-based ingestion of diverse organizational knowledge, cross-feature production serving, and scalable, sampling-free execution. On three classification tasks at Google, we find that Snorkel DryBell creates classifiers of comparable quality to ones trained with tens of thousands of hand-labeled examples, converts non-servable organizational resources to servable models for an average 52\% performance improvement, and executes over millions of data points in tens of minutes.},
urldate = {2019-08-09},
journal = {arXiv:1812.00417 [cs, stat]},
author = {Bach, Stephen H. and Rodriguez, Daniel and Liu, Yintao and Luo, Chong and Shao, Haidong and Xia, Cassandra and Sen, Souvik and Ratner, Alexander and Hancock, Braden and Alborzi, Houman and Kuchhal, Rahul and Ré, Christopher and Malkin, Rob},
month = dec,
year = {2018},
note = {arXiv: 1812.00417},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
}@article{davidson_hyperspherical_2018,
title = {Hyperspherical Variational Auto-Encoders},
type = {article},
year = {2018},
keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
websites = {http://arxiv.org/abs/1804.00891},
month = sep,
id = {80a3859e-084c-318b-8598-fbaedd606332},
created = {2022-03-28T09:45:03.638Z},
accessed = {2022-03-26},
file_attached = {true},
profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9},
group_id = {1ff583c0-be37-34fa-9c04-73c69437d354},
last_modified = {2022-03-29T08:05:33.433Z},
read = {false},
starred = {false},
authored = {false},
confirmed = {true},
hidden = {false},
citation_key = {davidsonHypersphericalVariationalAutoEncoders2018},
source_type = {article},
notes = {arXiv: 1804.00891},
private_publication = {false},
abstract = {The Variational Auto-Encoder (VAE) is one of the most used unsupervised machine learning models. But although the default choice of a Gaussian distribution for both the prior and posterior represents a mathematically convenient distribution often leading to competitive results, we show that this parameterization fails to model data with a latent hyperspherical structure. To address this issue we propose using a von Mises-Fisher (vMF) distribution instead, leading to a hyperspherical latent space. Through a series of experiments we show how such a hyperspherical VAE, or {$\mathcal{S}$}-VAE, is more suitable for capturing data with a hyperspherical latent structure, while outperforming a normal, {$\mathcal{N}$}-VAE, in low dimensions on other data types.},
bibtype = {article},
author = {Davidson, Tim R and Falorsi, Luca and De Cao, Nicola and Kipf, Thomas and Tomczak, Jakub M},
journal = {arXiv:1804.00891 [cs, stat]}
}@misc{gao_estimating_2018,
title = {Estimating {Mutual} {Information} for {Discrete}-{Continuous} {Mixtures}},
url = {http://arxiv.org/abs/1709.06212},
doi = {10.48550/arXiv.1709.06212},
abstract = {Estimating mutual information from observed samples is a basic primitive, useful in several machine learning tasks including correlation mining, information bottleneck clustering, learning a Chow-Liu tree, and conditional independence testing in (causal) graphical models. While mutual information is a well-defined quantity in general probability spaces, existing estimators can only handle two special cases of purely discrete or purely continuous pairs of random variables. The main challenge is that these methods first estimate the (differential) entropies of X, Y and the pair (X;Y) and add them up with appropriate signs to get an estimate of the mutual information. These 3H-estimators cannot be applied in general mixture spaces, where entropy is not well-defined. In this paper, we design a novel estimator for mutual information of discrete-continuous mixtures. We prove that the proposed estimator is consistent. We provide numerical experiments suggesting superiority of the proposed estimator compared to other heuristics of adding small continuous noise to all the samples and applying standard estimators tailored for purely continuous variables, and quantizing the samples and applying standard estimators tailored for purely discrete variables. This significantly widens the applicability of mutual information estimation in real-world applications, where some variables are discrete, some continuous, and others are a mixture between continuous and discrete components.},
urldate = {2023-03-03},
publisher = {arXiv},
author = {Gao, Weihao and Kannan, Sreeram and Oh, Sewoong and Viswanath, Pramod},
month = oct,
year = {2018},
note = {arXiv:1709.06212 [cs, math]},
keywords = {Computer Science - Machine Learning, Computer Science - Information Theory},
annote = {Comment: 25 pages, 3 figures. Part of this paper appears in the Conference on Neural Information Processing Systems (NIPS), 2017},
file = {arXiv Fulltext PDF:/Users/soumikp/Zotero/storage/KHK25ESM/Gao et al. - 2018 - Estimating Mutual Information for Discrete-Continu.pdf:application/pdf;arXiv.org Snapshot:/Users/soumikp/Zotero/storage/CCBYJK6U/1709.html:text/html},
}@article{madry_towards_2017,
title = {Towards {Deep} {Learning} {Models} {Resistant} to {Adversarial} {Attacks}},
url = {http://arxiv.org/abs/1706.06083},
abstract = {Recent work has demonstrated that neural networks are vulnerable to adversarial examples, i.e., inputs that are almost indistinguishable from natural data and yet classified incorrectly by the network. In fact, some of the latest findings suggest that the existence of adversarial attacks may be an inherent weakness of deep learning models. To address this problem, we study the adversarial robustness of neural networks through the lens of robust optimization. This approach provides us with a broad and unifying view on much of the prior work on this topic. Its principled nature also enables us to identify methods for both training and attacking neural networks that are reliable and, in a certain sense, universal. In particular, they specify a concrete security guarantee that would protect against any adversary. These methods let us train networks with significantly improved resistance to a wide range of adversarial attacks. They also suggest the notion of security against a first-order adversary as a natural and broad security guarantee. We believe that robustness against such well-defined classes of adversaries is an important stepping stone towards fully resistant deep learning models.},
urldate = {2019-06-20},
journal = {arXiv:1706.06083 [cs, stat]},
author = {Madry, Aleksander and Makelov, Aleksandar and Schmidt, Ludwig and Tsipras, Dimitris and Vladu, Adrian},
month = jun,
year = {2017},
note = {arXiv: 1706.06083},
keywords = {\#broken, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Jab/\#Pre, Statistics - Machine Learning, ⛔ No DOI found},
}@misc{bellemare_distributional_2017,
title = {A {Distributional} {Perspective} on {Reinforcement} {Learning}},
url = {http://arxiv.org/abs/1707.06887},
abstract = {In this paper we argue for the fundamental importance of the value distribution: the distribution of the random return received by a reinforcement learning agent. This is in contrast to the common approach to reinforcement learning which models the expectation of this return, or value. Although there is an established body of literature studying the value distribution, thus far it has always been used for a specific purpose such as implementing risk-aware behaviour. We begin with theoretical results in both the policy evaluation and control settings, exposing a significant distributional instability in the latter. We then use the distributional perspective to design a new algorithm which applies Bellman’s equation to the learning of approximate value distributions. We evaluate our algorithm using the suite of games from the Arcade Learning Environment. We obtain both state-of-the-art results and anecdotal evidence demonstrating the importance of the value distribution in approximate reinforcement learning. Finally, we combine theoretical and empirical evidence to highlight the ways in which the value distribution impacts learning in the approximate setting.},
language = {en},
urldate = {2023-10-13},
publisher = {arXiv},
author = {Bellemare, Marc G. and Dabney, Will and Munos, Rémi},
month = jul,
year = {2017},
note = {arXiv:1707.06887 [cs, stat]},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Statistics - Machine Learning},
}@misc{zhang_understanding_2017,
title = {Understanding deep learning requires rethinking generalization},
url = {http://arxiv.org/abs/1611.03530},
doi = {10.48550/arXiv.1611.03530},
abstract = {Despite their massive size, successful deep artificial neural networks can exhibit a remarkably small difference between training and test performance. Conventional wisdom attributes small generalization error either to properties of the model family, or to the regularization techniques used during training. Through extensive systematic experiments, we show how these traditional approaches fail to explain why large neural networks generalize well in practice. Specifically, our experiments establish that state-of-the-art convolutional networks for image classification trained with stochastic gradient methods easily fit a random labeling of the training data. This phenomenon is qualitatively unaffected by explicit regularization, and occurs even if we replace the true images by completely unstructured random noise. We corroborate these experimental findings with a theoretical construction showing that simple depth two neural networks already have perfect finite sample expressivity as soon as the number of parameters exceeds the number of data points as it usually does in practice. We interpret our experimental findings by comparison with traditional models.},
publisher = {arXiv},
author = {Zhang, Chiyuan and Bengio, Samy and Hardt, Moritz and Recht, Benjamin and Vinyals, Oriol},
month = feb,
year = {2017},
note = {arXiv:1611.03530 [cs]},
keywords = {Computer Science - Machine Learning},
annote = {Comment: Published in ICLR 2017},
file = {Preprint PDF:/Users/peerherholz/Zotero/storage/SHXGIPHG/Zhang et al. - 2017 - Understanding deep learning requires rethinking generalization.pdf:application/pdf;Snapshot:/Users/peerherholz/Zotero/storage/CKW3BV4H/1611.html:text/html},
}@article{goyal_nonparametric_2017,
title = {Nonparametric {Variational} {Auto}-encoders for {Hierarchical} {Representation} {Learning}},
url = {http://arxiv.org/abs/1703.07027},
abstract = {The recently developed variational autoencoders (VAEs) have proved to be an effective confluence of the rich representational power of neural networks with Bayesian methods. However, most work on VAEs use a rather simple prior over the latent variables such as standard normal distribution, thereby restricting its applications to relatively simple phenomena. In this work, we propose hierarchical nonparametric variational autoencoders, which combines treestructured Bayesian nonparametric priors with VAEs, to enable infinite flexibility of the latent representation space. Both the neural parameters and Bayesian priors are learned jointly using tailored variational inference. The resulting model induces a hierarchical structure of latent semantic concepts underlying the data corpus, and infers accurate representations of data instances. We apply our model in video representation learning. Our method is able to discover highly interpretable activity hierarchies, and obtain improved clustering accuracy and generalization capacity based on the learned rich representations.},
language = {en},
urldate = {2022-01-19},
journal = {arXiv:1703.07027 [cs, stat]},
author = {Goyal, Prasoon and Hu, Zhiting and Liang, Xiaodan and Wang, Chenyu and Xing, Eric},
month = aug,
year = {2017},
note = {arXiv: 1703.07027},
keywords = {/unread, Computer Science - Machine Learning, Statistics - Machine Learning, ⛔ No DOI found},
}@article{laroche_multi-advisor_2017,
title = {Multi-{Advisor} {Reinforcement} {Learning}},
url = {http://arxiv.org/abs/1704.00756},
abstract = {We consider tackling a single-agent RL problem by distributing it to n learners. These learners, called advisors, endeavour to solve the problem from a different focus. Their advice, taking the form of action values, is then communicated to an aggregator, which is in control of the system. We show that the local planning method for the advisors is critical and that none of the ones found in the literature is flawless: the egocentric planning overestimates values of states where the other advisors disagree, and the agnostic planning is inefficient around danger zones. We introduce a novel approach called empathic and discuss its theoretical aspects. We empirically examine and validate our theoretical findings on a fruit collection task.},
language = {en},
urldate = {2019-06-18},
journal = {arXiv:1704.00756 [cs, stat]},
author = {Laroche, Romain and Fatemi, Mehdi and Romoff, Joshua and van Seijen, Harm},
month = apr,
year = {2017},
note = {arXiv: 1704.00756},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Statistics - Machine Learning}
}@article{wang_tacotron_2017,
title = {Tacotron: {Towards} {End}-to-{End} {Speech} {Synthesis}},
shorttitle = {Tacotron},
url = {http://arxiv.org/abs/1703.10135},
abstract = {A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module. Building these components often requires extensive domain expertise and may contain brittle design choices. In this paper, we present Tacotron, an end-to-end generative text-to-speech model that synthesizes speech directly from characters. Given {\textless}text, audio{\textgreater} pairs, the model can be trained completely from scratch with random initialization. We present several key techniques to make the sequence-tosequence framework perform well for this challenging task. Tacotron achieves a 3.82 subjective 5-scale mean opinion score on US English, outperforming a production parametric system in terms of naturalness. In addition, since Tacotron generates speech at the frame level, it’s substantially faster than sample-level autoregressive methods.},
language = {en},
urldate = {2022-01-19},
journal = {arXiv:1703.10135 [cs]},
author = {Wang, Yuxuan and Skerry-Ryan, R. J. and Stanton, Daisy and Wu, Yonghui and Weiss, Ron J. and Jaitly, Navdeep and Yang, Zongheng and Xiao, Ying and Chen, Zhifeng and Bengio, Samy and Le, Quoc and Agiomyrgiannakis, Yannis and Clark, Rob and Saurous, Rif A.},
month = apr,
year = {2017},
note = {arXiv: 1703.10135},
keywords = {/unread, Computer Science - Computation and Language, Computer Science - Machine Learning, Computer Science - Sound, ⛔ No DOI found},
}@article{sundararajan_axiomatic_2017,
title = {Axiomatic {Attribution} for {Deep} {Networks}},
url = {http://arxiv.org/abs/1703.01365},
abstract = {We study the problem of attributing the prediction of a deep network to its input features, a problem previously studied by several other works. We identify two fundamental axioms—Sensitivity and Implementation Invariance that attribution methods ought to satisfy. We show that they are not satisfied by most known attribution methods, which we consider to be a fundamental weakness of those methods. We use the axioms to guide the design of a new attribution method called Integrated Gradients. Our method requires no modification to the original network and is extremely simple to implement; it just needs a few calls to the standard gradient operator. We apply this method to a couple of image models, a couple of text models and a chemistry model, demonstrating its ability to debug networks, to extract rules from a network, and to enable users to engage with models better.},
language = {en},
urldate = {2022-03-02},
journal = {arXiv:1703.01365 [cs]},
author = {Sundararajan, Mukund and Taly, Ankur and Yan, Qiqi},
month = jun,
year = {2017},
note = {arXiv: 1703.01365},
keywords = {Computer Science - Machine Learning},
}@misc{ba_layer_2016,
title = {Layer {Normalization}},
url = {http://arxiv.org/abs/1607.06450},
abstract = {Training state-of-the-art, deep neural networks is computationally expensive. One way to reduce the training time is to normalize the activities of the neurons. A recently introduced technique called batch normalization uses the distribution of the summed input to a neuron over a mini-batch of training cases to compute a mean and variance which are then used to normalize the summed input to that neuron on each training case. This significantly reduces the training time in feed-forward neural networks. However, the effect of batch normalization is dependent on the mini-batch size and it is not obvious how to apply it to recurrent neural networks. In this paper, we transpose batch normalization into layer normalization by computing the mean and variance used for normalization from all of the summed inputs to the neurons in a layer on a single training case. Like batch normalization, we also give each neuron its own adaptive bias and gain which are applied after the normalization but before the non-linearity. Unlike batch normalization, layer normalization performs exactly the same computation at training and test times. It is also straightforward to apply to recurrent neural networks by computing the normalization statistics separately at each time step. Layer normalization is very effective at stabilizing the hidden state dynamics in recurrent networks. Empirically, we show that layer normalization can substantially reduce the training time compared with previously published techniques.},
language = {en},
urldate = {2023-06-13},
publisher = {arXiv},
author = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},
month = jul,
year = {2016},
note = {arXiv:1607.06450 [cs, stat]
rate: 5},
keywords = {\#⭐⭐⭐⭐⭐, /unread, Computer Science - Machine Learning, Statistics - Machine Learning, {\textbackslash}unread, ⭐⭐⭐⭐⭐},
}@misc{bradbury_quasi-recurrent_2016,
title = {Quasi-{Recurrent} {Neural} {Networks}},
url = {http://arxiv.org/abs/1611.01576},
doi = {10.48550/arXiv.1611.01576},
eprint = {1611.01576},
eprinttype = {arXiv},
abstract = {Recurrent neural networks are a powerful tool for modeling sequential data, but the dependence of each timestep’s computation on the previous timestep’s output limits parallelism and makes RNNs unwieldy for very long sequences. We introduce quasi-recurrent neural networks (QRNNs), an approach to neural sequence modeling that alternates convolutional layers, which apply in parallel across timesteps, and a minimalist recurrent pooling function that applies in parallel across channels. Despite lacking trainable recurrent layers, stacked QRNNs have better predictive accuracy than stacked LSTMs of the same hidden size. Due to their increased parallelism, they are up to 16 times faster at train and test time. Experiments on language modeling, sentiment classification, and character-level neural machine translation demonstrate these advantages and underline the viability of QRNNs as a basic building block for a variety of sequence tasks.},
language = {en},
urldate = {2022-01-19},
publisher = {arXiv},
author = {Bradbury, James and Merity, Stephen and Xiong, Caiming and Socher, Richard},
month = nov,
year = {2016},
note = {arXiv:1611.01576 [cs]},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing},
}@misc{finn_deep_2016,
title = {Deep {Spatial} {Autoencoders} for {Visuomotor} {Learning}},
url = {http://arxiv.org/abs/1509.06113},
doi = {10.48550/arXiv.1509.06113},
eprint = {1509.06113},
eprinttype = {arXiv},
abstract = {Reinforcement learning provides a powerful and flexible framework for automated acquisition of robotic motion skills. However, applying reinforcement learning requires a sufficiently detailed representation of the state, including the configuration of task-relevant objects. We present an approach that automates state-space construction by learning a state representation directly from camera images. Our method uses a deep spatial autoencoder to acquire a set of feature points that describe the environment for the current task, such as the positions of objects, and then learns a motion skill with these feature points using an efficient reinforcement learning method based on local linear models. The resulting controller reacts continuously to the learned feature points, allowing the robot to dynamically manipulate objects in the world with closed-loop control. We demonstrate our method with a PR2 robot on tasks that include pushing a free-standing toy block, picking up a bag of rice using a spatula, and hanging a loop of rope on a hook at various positions. In each task, our method automatically learns to track task-relevant objects and manipulate their configuration with the robot’s arm.},
language = {en},
urldate = {2022-07-26},
publisher = {arXiv},
author = {Finn, Chelsea and Tan, Xin Yu and Duan, Yan and Darrell, Trevor and Levine, Sergey and Abbeel, Pieter},
month = mar,
year = {2016},
note = {arXiv:1509.06113 [cs]},
keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Robotics},
}@article{bouboulis_complex_2015,
title = {Complex {Support} {Vector} {Machines} for {Regression} and {Quaternary} {Classification}},
volume = {26},
issn = {2162-237X, 2162-2388},
url = {http://arxiv.org/abs/1303.2184},
doi = {10.1109/TNNLS.2014.2336679},
eprint = {1303.2184},
eprinttype = {arXiv},
abstract = {The paper presents a new framework for complex Support Vector Regression as well as Support Vector Machines for quaternary classification. The method exploits the notion of widely linear estimation to model the input-out relation for complex-valued data and considers two cases: a) the complex data are split into their real and imaginary parts and a typical real kernel is employed to map the complex data to a complexified feature space and b) a pure complex kernel is used to directly map the data to the induced complex feature space. The recently developed Wirtinger’s calculus on complex reproducing kernel Hilbert spaces (RKHS) is employed in order to compute the Lagrangian and derive the dual optimization problem. As one of our major results, we prove that any complex SVM/SVR task is equivalent with solving two real SVM/SVR tasks exploiting a specific real kernel which is generated by the chosen complex kernel. In particular, the case of pure complex kernels leads to the generation of new kernels, which have not been considered before. In the classification case, the proposed framework inherently splits the complex space into four parts. This leads naturally in solving the four class-task (quaternary classification), instead of the typical two classes of the real SVM. In turn, this rationale can be used in a multiclass problem as a split-class scenario based on four classes, as opposed to the one-versus-all method; this can lead to significant computational savings. Experiments demonstrate the effectiveness of the proposed framework for regression and classification tasks that involve complex data.},
language = {en},
number = {6},
urldate = {2022-01-19},
journal = {IEEE Transactions on Neural Networks and Learning Systems},
author = {Bouboulis, Pantelis and Theodoridis, Sergios and Mavroforakis, Charalampos and Dalla, Leoni},
month = jun,
year = {2015},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
pages = {1260--1274},
}@misc{yosinski_understanding_2015,
title = {Understanding {Neural} {Networks} {Through} {Deep} {Visualization}},
url = {http://arxiv.org/abs/1506.06579},
eprint = {1506.06579},
eprinttype = {arXiv},
abstract = {Recent years have produced great advances in training large, deep neural networks (DNNs), including notable successes in training convolutional neural networks (convnets) to recognize natural images. However, our understanding of how these models work, especially what computations they perform at intermediate layers, has lagged behind. Progress in the field will be further accelerated by the development of better tools for visualizing and interpreting neural nets. We introduce two such tools here. The first is a tool that visualizes the activations produced on each layer of a trained convnet as it processes an image or video (e.g. a live webcam stream). We have found that looking at live activations that change in response to user input helps build valuable intuitions about how convnets work. The second tool enables visualizing features at each layer of a DNN via regularized optimization in image space. Because previous versions of this idea produced less recognizable images, here we introduce several new regularization methods that combine to produce qualitatively clearer, more interpretable visualizations. Both tools are open source and work on a pre-trained convnet with minimal setup.},
urldate = {2022-05-28},
publisher = {arXiv},
author = {Yosinski, Jason and Clune, Jeff and Nguyen, Anh and Fuchs, Thomas and Lipson, Hod},
month = jun,
year = {2015},
doi = {10.48550/arXiv.1506.06579},
note = {arXiv:1506.06579 [cs]},
keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing},
}@misc{synnaeve_weakly_2015,
title = {Weakly {Supervised} {Multi}-{Embeddings} {Learning} of {Acoustic} {Models}},
url = {http://arxiv.org/abs/1412.6645},
doi = {10.48550/arXiv.1412.6645},
eprint = {1412.6645},
eprinttype = {arXiv},
abstract = {We trained a Siamese network with multi-task same/different information on a speech dataset, and found that it was possible to share a network for both tasks without a loss in performance. The first task was to discriminate between two same or different words, and the second was to discriminate between two same or different talkers.},
urldate = {2023-01-09},
publisher = {arXiv},
author = {Synnaeve, Gabriel and Dupoux, Emmanuel},
month = apr,
year = {2015},
note = {arXiv:1412.6645 [cs]},
keywords = {Computer Science - Computation and Language, Computer Science - Machine Learning, Computer Science - Sound, I.2.6, I.2.7, I.5.1},
}@misc{cho_learning_2014,
title = {Learning {Phrase} {Representations} using {RNN} {Encoder}-{Decoder} for {Statistical} {Machine} {Translation}},
url = {http://arxiv.org/abs/1406.1078},
doi = {10.48550/arXiv.1406.1078},
eprint = {1406.1078},
eprinttype = {arXiv},
abstract = {In this paper, we propose a novel neural network model called RNN Encoder-Decoder that consists of two recurrent neural networks (RNN). One RNN encodes a sequence of symbols into a fixed-length vector representation, and the other decodes the representation into another sequence of symbols. The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence. The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder-Decoder as an additional feature in the existing log-linear model. Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases.},
urldate = {2020-09-16},
publisher = {arXiv},
author = {Cho, Kyunghyun and van Merrienboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
month = sep,
year = {2014},
note = {arXiv:1406.1078 [cs, stat]},
keywords = {Computer Science - Computation and Language, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Statistics - Machine Learning},
}@misc{chung_empirical_2014,
title = {Empirical {Evaluation} of {Gated} {Recurrent} {Neural} {Networks} on {Sequence} {Modeling}},
url = {http://arxiv.org/abs/1412.3555},
doi = {10.48550/arXiv.1412.3555},
eprint = {1412.3555},
eprinttype = {arXiv},
abstract = {In this paper we compare different types of recurrent units in recurrent neural networks (RNNs). Especially, we focus on more sophisticated units that implement a gating mechanism, such as a long short-term memory (LSTM) unit and a recently proposed gated recurrent unit (GRU). We evaluate these recurrent units on the tasks of polyphonic music modeling and speech signal modeling. Our experiments revealed that these advanced recurrent units are indeed better than more traditional recurrent units such as tanh units. Also, we found GRU to be comparable to LSTM.},
urldate = {2022-03-02},
publisher = {arXiv},
author = {Chung, Junyoung and Gulcehre, Caglar and Cho, KyungHyun and Bengio, Yoshua},
month = dec,
year = {2014},
note = {arXiv:1412.3555 [cs]},
keywords = {Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing},
}@misc{mnih_playing_2013,
title = {Playing {Atari} with {Deep} {Reinforcement} {Learning}},
url = {http://arxiv.org/abs/1312.5602},
doi = {10.48550/arXiv.1312.5602},
eprint = {1312.5602},
eprinttype = {arXiv},
abstract = {We present the first deep learning model to successfully learn control policies directly from high-dimensional sensory input using reinforcement learning. The model is a convolutional neural network, trained with a variant of Q-learning, whose input is raw pixels and whose output is a value function estimating future rewards. We apply our method to seven Atari 2600 games from the Arcade Learning Environment, with no adjustment of the architecture or learning algorithm. We find that it outperforms all previous approaches on six of the games and surpasses a human expert on three of them.},
urldate = {2019-05-14},
publisher = {arXiv},
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},
month = dec,
year = {2013},
note = {arXiv:1312.5602 [cs]},
keywords = {Computer Science - Machine Learning},
}@article{ozairWassersteinDependencyMeasure2019,
author = {Ozair, Sherjil and Lynch, Corey and Bengio, Yoshua and family=Oord, given=Aaron, prefix=van den, useprefix=false and Levine, Sergey and Sermanet, Pierre},
title = {Wasserstein {{Dependency Measure}} for {{Representation Learning}}},
date = {2019-03-27},
eprint = {1903.11780},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {cs, stat},
url = {http://arxiv.org/abs/1903.11780},
urldate = {2019-03-29},
abstract = {Mutual information maximization has emerged as a powerful learning objective for unsupervised representation learning obtaining state-of-the-art performance in applications such as object recognition, speech recognition, and reinforcement learning. However, such approaches are fundamentally limited since a tight lower bound of mutual information requires sample size exponential in the mutual information. This limits the applicability of these approaches for prediction tasks with high mutual information, such as in video understanding or reinforcement learning. In these settings, such techniques are prone to overfit, both in theory and in practice, and capture only a few of the relevant factors of variation. This leads to incomplete representations that are not optimal for downstream tasks. In this work, we empirically demonstrate that mutual information-based representation learning approaches do fail to learn complete representations on a number of designed and real-world tasks. To mitigate these problems we introduce the Wasserstein dependency measure, which learns more complete representations by using the Wasserstein distance instead of the KL divergence in the mutual information estimator. We show that a practical approximation to this theoretically motivated solution, constructed using Lipschitz constraint techniques from the GAN literature, achieves substantially improved results on tasks where incomplete representations are a major challenge.},
keywords = {Statistics - Machine Learning,Computer Science - Machine Learning},
file = {/home/dimitri/Nextcloud/Zotero/storage/ZYQH2Y9K/Ozair et al. - 2019 - Wasserstein Dependency Measure for Representation .pdf;/home/dimitri/Nextcloud/Zotero/storage/PPDX5S4W/1903.html},
}@article{frognerLearningEmbeddingsEntropic2019,
author = {Frogner, Charlie and Mirzazadeh, Farzaneh and Solomon, Justin},
title = {Learning {{Embeddings}} into {{Entropic Wasserstein Spaces}}},
date = {2019-05-08},
eprint = {1905.03329},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {cs, stat},
url = {http://arxiv.org/abs/1905.03329},
urldate = {2019-05-10},
abstract = {Euclidean embeddings of data are fundamentally limited in their ability to capture latent semantic structures, which need not conform to Euclidean spatial assumptions. Here we consider an alternative, which embeds data as discrete probability distributions in a Wasserstein space, endowed with an optimal transport metric. Wasserstein spaces are much larger and more flexible than Euclidean spaces, in that they can successfully embed a wider variety of metric structures. We exploit this flexibility by learning an embedding that captures semantic information in the Wasserstein distance between embedded distributions. We examine empirically the representational capacity of our learned Wasserstein embeddings, showing that they can embed a wide variety of metric structures with smaller distortion than an equivalent Euclidean embedding. We also investigate an application to word embedding, demonstrating a unique advantage of Wasserstein embeddings: We can visualize the high-dimensional embedding directly, since it is a probability distribution on a low-dimensional space. This obviates the need for dimensionality reduction techniques like t-SNE for visualization.},
keywords = {Statistics - Machine Learning,Computer Science - Machine Learning},
file = {/home/dimitri/Nextcloud/Zotero/storage/UAEFXQR2/Frogner et al. - 2019 - Learning Embeddings into Entropic Wasserstein Spac.pdf;/home/dimitri/Nextcloud/Zotero/storage/AY98Y254/1905.html},
}@article{rezendeStochasticBackpropagationApproximate2014,
author = {Rezende, Danilo Jimenez and Mohamed, Shakir and Wierstra, Daan},
title = {Stochastic {{Backpropagation}} and {{Approximate Inference}} in {{Deep Generative Models}}},
date = {2014-01-16},
eprint = {1401.4082},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {cs, stat},
url = {http://arxiv.org/abs/1401.4082},
urldate = {2019-01-25},
abstract = {We marry ideas from deep neural networks and approximate Bayesian inference to derive a generalised class of deep, directed generative models, endowed with a new algorithm for scalable inference and learning. Our algorithm introduces a recognition model to represent approximate posterior distributions, and that acts as a stochastic encoder of the data. We develop stochastic back-propagation -- rules for back-propagation through stochastic variables -- and use this to develop an algorithm that allows for joint optimisation of the parameters of both the generative and recognition model. We demonstrate on several real-world data sets that the model generates realistic samples, provides accurate imputations of missing data and is a useful tool for high-dimensional data visualisation.},
keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Statistics - Computation,Statistics - Methodology,Computer Science - Machine Learning},
file = {/home/dimitri/Nextcloud/Zotero/storage/KQQEGKHT/Rezende et al. - 2014 - Stochastic Backpropagation and Approximate Inferen.pdf;/home/dimitri/Nextcloud/Zotero/storage/DHSBCQPC/1401.html},
}@article{hjelmLearningDeepRepresentations2018,
author = {Hjelm, R. Devon and Fedorov, Alex and Lavoie-Marchildon, Samuel and Grewal, Karan and Bachman, Phil and Trischler, Adam and Bengio, Yoshua},
title = {Learning Deep Representations by Mutual Information Estimation and Maximization},
date = {2018-08-20},
eprint = {1808.06670},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {cs, stat},
url = {http://arxiv.org/abs/1808.06670},
urldate = {2019-04-01},
abstract = {In this work, we perform unsupervised learning of representations by maximizing mutual information between an input and the output of a deep neural network encoder. Importantly, we show that structure matters: incorporating knowledge about locality of the input to the objective can greatly influence a representation's suitability for downstream tasks. We further control characteristics of the representation by matching to a prior distribution adversarially. Our method, which we call Deep InfoMax (DIM), outperforms a number of popular unsupervised learning methods and competes with fully-supervised learning on several classification tasks. DIM opens new avenues for unsupervised learning of representations and is an important step towards flexible formulations of representation-learning objectives for specific end-goals.},
keywords = {Statistics - Machine Learning,Computer Science - Machine Learning},
file = {/home/dimitri/Nextcloud/Zotero/storage/36AV9H8R/Hjelm et al. - 2018 - Learning deep representations by mutual informatio.pdf;/home/dimitri/Nextcloud/Zotero/storage/IXX3GXTN/1808.html},
}@article{kunchevaSpectralMultiscaleCommunity2019,
author = {Kuncheva, Zhana and Montana, Giovanni},
title = {Spectral {{Multi}}-Scale {{Community Detection}} in {{Temporal Networks}} with an {{Application}}},
date = {2019-01-29},
eprint = {1901.10521},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {physics, stat},
url = {http://arxiv.org/abs/1901.10521},
urldate = {2019-01-31},
abstract = {The analysis of temporal networks has a wide area of applications in a world of technological advances. An important aspect of temporal network analysis is the discovery of community structures. Real data networks are often very large and the communities are observed to have a hierarchical structure referred to as multi-scale communities. Changes in the community structure over time might take place either at one scale or across all scales of the community structure. The multilayer formulation of the modularity maximization (MM) method introduced captures the changing multi-scale community structure of temporal networks. This method introduces a coupling between communities in neighboring time layers by allowing inter-layer connections, while different values of the resolution parameter enable the detection of multi-scale communities. However, the range of this parameter's values must be manually selected. When dealing with real life data, communities at one or more scales can go undiscovered if appropriate parameter ranges are not selected. A novel Temporal Multi-scale Community Detection (TMSCD) method overcomes the obstacles mentioned above. This is achieved by using the spectral properties of the temporal network represented as a multilayer network. In this framework we select automatically the range of relevant scales within which multi-scale community partitions are sought.},
keywords = {Statistics - Machine Learning,Physics - Physics and Society,Computer Science - Social and Information Networks,Computer Science - Machine Learning},
file = {/home/dimitri/Nextcloud/Zotero/storage/2ZF7B936/Kuncheva and Montana - 2019 - Spectral Multi-scale Community Detection in Tempor.pdf;/home/dimitri/Nextcloud/Zotero/storage/424VQEEI/1901.html},
}@article{yurochkinHierarchicalOptimalTransport2019,
author = {Yurochkin, Mikhail and Claici, Sebastian and Chien, Edward and Mirzazadeh, Farzaneh and Solomon, Justin},
title = {Hierarchical {{Optimal Transport}} for {{Document Representation}}},
date = {2019-06-25},
eprint = {1906.10827},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {cs, stat},
url = {http://arxiv.org/abs/1906.10827},
urldate = {2019-06-28},
abstract = {The ability to measure similarity between documents enables intelligent summarization and analysis of large corpora. Past distances between documents suffer from either an inability to incorporate semantic similarities between words or from scalability issues. As an alternative, we introduce hierarchical optimal transport as a meta-distance between documents, where documents are modeled as distributions over topics, which themselves are modeled as distributions over words. We then solve an optimal transport problem on the smaller topic space to compute a similarity score. We give conditions on the topics under which this construction defines a distance, and we relate it to the word mover's distance. We evaluate our technique for \$k\$-NN classification and show better interpretability and scalability with comparable performance to current methods at a fraction of the cost.},
keywords = {Statistics - Machine Learning,Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Information Retrieval},
file = {/home/dimitri/Nextcloud/Zotero/storage/EJGKCIUG/Yurochkin et al. - 2019 - Hierarchical Optimal Transport for Document Repres.pdf;/home/dimitri/Nextcloud/Zotero/storage/EC9XIVU7/1906.html},
}@article{kawaguchiGeneralizationDeepLearning2017,
author = {Kawaguchi, Kenji and Kaelbling, Leslie Pack and Bengio, Yoshua},
title = {Generalization in {{Deep Learning}}},
date = {2017-10-15},
eprint = {1710.05468},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {cs, stat},
url = {http://arxiv.org/abs/1710.05468},
urldate = {2019-05-14},
abstract = {This paper provides non-vacuous and numerically-tight generalization guarantees for deep learning, as well as theoretical insights into why and how deep learning can generalize well, despite its large capacity, complexity, possible algorithmic instability, nonrobustness, and sharp minima, responding to an open question in the literature. We also propose new open problems and discuss the limitations of our results.},
keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing},
file = {/home/dimitri/Nextcloud/Zotero/storage/IGRS7AC2/Kawaguchi et al. - 2017 - Generalization in Deep Learning.pdf;/home/dimitri/Nextcloud/Zotero/storage/TXV7FXKZ/1710.html},
}@article{bahdanauNeuralMachineTranslation2014,
author = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
title = {Neural {{Machine Translation}} by {{Jointly Learning}} to {{Align}} and {{Translate}}},
date = {2014-09-01},
eprint = {1409.0473},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {cs, stat},
url = {http://arxiv.org/abs/1409.0473},
urldate = {2018-11-03},
abstract = {Neural machine translation is a recently proposed approach to machine translation. Unlike the traditional statistical machine translation, the neural machine translation aims at building a single neural network that can be jointly tuned to maximize the translation performance. The models proposed recently for neural machine translation often belong to a family of encoder-decoders and consists of an encoder that encodes a source sentence into a fixed-length vector from which a decoder generates a translation. In this paper, we conjecture that the use of a fixed-length vector is a bottleneck in improving the performance of this basic encoder-decoder architecture, and propose to extend this by allowing a model to automatically (soft-)search for parts of a source sentence that are relevant to predicting a target word, without having to form these parts as a hard segment explicitly. With this new approach, we achieve a translation performance comparable to the existing state-of-the-art phrase-based system on the task of English-to-French translation. Furthermore, qualitative analysis reveals that the (soft-)alignments found by the model agree well with our intuition.},
keywords = {Statistics - Machine Learning,Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing},
file = {/home/dimitri/Nextcloud/Zotero/storage/EDT3ACT8/Bahdanau et al. - 2014 - Neural Machine Translation by Jointly Learning to .pdf;/home/dimitri/Nextcloud/Zotero/storage/APNE596P/1409.html},
}@article{wuTaleThreeProbabilistic2018,
author = {Wu, Ying Nian and Gao, Ruiqi and Han, Tian and Zhu, Song-Chun},
title = {A {{Tale}} of {{Three Probabilistic Families}}: {{Discriminative}}, {{Descriptive}} and {{Generative Models}}},
shorttitle = {A {{Tale}} of {{Three Probabilistic Families}}},
date = {2018-10-09},
eprint = {1810.04261},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {cs, stat},
url = {http://arxiv.org/abs/1810.04261},
urldate = {2018-11-01},
abstract = {The pattern theory of Grenander is a mathematical framework where the patterns are represented by probability models on random variables of algebraic structures. In this paper, we review three families of probability models, namely, the discriminative models, the descriptive models, and the generative models. A discriminative model is in the form of a classifier. It specifies the conditional probability of the class label given the input signal. The descriptive model specifies the probability distribution of the signal, based on an energy function defined on the signal. A generative model assumes that the signal is generated by some latent variables via a transformation. We shall review these models within a common framework and explore their connections. We shall also review the recent developments that take advantage of the high approximation capacities of deep neural networks.},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Statistics - Machine Learning,Computer Science - Machine Learning},
file = {/home/dimitri/Nextcloud/Zotero/storage/GR22VWEA/Wu et al. - 2018 - A Tale of Three Probabilistic Families Discrimina.pdf;/home/dimitri/Nextcloud/Zotero/storage/TYD8LULW/1810.html},
}@article{leikeScalableAgentAlignment2018,
author = {Leike, Jan and Krueger, David and Everitt, Tom and Martic, Miljan and Maini, Vishal and Legg, Shane},
title = {Scalable Agent Alignment via Reward Modeling: A Research Direction},
shorttitle = {Scalable Agent Alignment via Reward Modeling},
date = {2018-11-19},
eprint = {1811.07871},
eprinttype = {arxiv},
archivePrefix = {arXiv},
primaryClass = {cs, stat},
url = {http://arxiv.org/abs/1811.07871},
urldate = {2019-01-18},
abstract = {One obstacle to applying reinforcement learning algorithms to real-world problems is the lack of suitable reward functions. Designing such reward functions is difficult in part because the user only has an implicit understanding of the task objective. This gives rise to the agent alignment problem: how do we create agents that behave in accordance with the user's intentions? We outline a high-level research direction to solve the agent alignment problem centered around reward modeling: learning a reward function from interaction with the user and optimizing the learned reward function with reinforcement learning. We discuss the key challenges we expect to face when scaling reward modeling to complex and general domains, concrete approaches to mitigate these challenges, and ways to establish trust in the resulting agents.},
keywords = {Statistics - Machine Learning,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing},
file = {/home/dimitri/Nextcloud/Zotero/storage/LA4VMIPH/Leike et al. - 2018 - Scalable agent alignment via reward modeling a re.pdf;/home/dimitri/Nextcloud/Zotero/storage/D5SRETKG/1811.html},
}